1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *      IPv6 output functions
4  *      Linux INET6 implementation
5  *
6  *      Authors:
7  *      Pedro Roque             <roque@di.fc.ul.pt>
8  *
9  *      Based on linux/net/ipv4/ip_output.c
10  *
11  *      Changes:
12  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
13  *                              extension headers are implemented.
14  *                              route changes now work.
15  *                              ip6_forward does not confuse sniffers.
16  *                              etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *      Imran Patel     :       frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *                      :       add ip6_append_data and related functions
22  *                              for datagram xmit
23  */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62         struct dst_entry *dst = skb_dst(skb);
63         struct net_device *dev = dst->dev;
64         struct inet6_dev *idev = ip6_dst_idev(dst);
65         unsigned int hh_len = LL_RESERVED_SPACE(dev);
66         const struct in6_addr *daddr, *nexthop;
67         struct ipv6hdr *hdr;
68         struct neighbour *neigh;
69         int ret;
70
71         /* Be paranoid, rather than too clever. */
72         if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73                 skb = skb_expand_head(skb, hh_len);
74                 if (!skb) {
75                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76                         return -ENOMEM;
77                 }
78         }
79
80         hdr = ipv6_hdr(skb);
81         daddr = &hdr->daddr;
82         if (ipv6_addr_is_multicast(daddr)) {
83                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84                     ((mroute6_is_socket(net, skb) &&
85                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86                      ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88
89                         /* Do not check for IFF_ALLMULTI; multicast routing
90                          * is not supported in any case.
91                          */
92                         if (newskb)
93                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94                                         net, sk, newskb, NULL, newskb->dev,
95                                         dev_loopback_xmit);
96
97                         if (hdr->hop_limit == 0) {
98                                 IP6_INC_STATS(net, idev,
99                                               IPSTATS_MIB_OUTDISCARDS);
100                                 kfree_skb(skb);
101                                 return 0;
102                         }
103                 }
104
105                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106                 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107                     !(dev->flags & IFF_LOOPBACK)) {
108                         kfree_skb(skb);
109                         return 0;
110                 }
111         }
112
113         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114                 int res = lwtunnel_xmit(skb);
115
116                 if (res != LWTUNNEL_XMIT_CONTINUE)
117                         return res;
118         }
119
120         rcu_read_lock();
121         nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
122         neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
123
124         if (unlikely(IS_ERR_OR_NULL(neigh))) {
125                 if (unlikely(!neigh))
126                         neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
127                 if (IS_ERR(neigh)) {
128                         rcu_read_unlock();
129                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
130                         kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
131                         return -EINVAL;
132                 }
133         }
134         sock_confirm_neigh(skb, neigh);
135         ret = neigh_output(neigh, skb, false);
136         rcu_read_unlock();
137         return ret;
138 }
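
/*
 * Every IPv6 packet that leaves the stack funnels through
 * ip6_finish_output2() above: locally generated packets via ip6_output(),
 * forwarded ones via ip6_forward_finish(), and each fragment produced by
 * ip6_fragment(). It is the last IPv6-level step before neigh_output()
 * hands the skb to the device layer.
 */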
139
140 static int
141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142                                     struct sk_buff *skb, unsigned int mtu)
143 {
144         struct sk_buff *segs, *nskb;
145         netdev_features_t features;
146         int ret = 0;
147
148         /* Please see corresponding comment in ip_finish_output_gso
149          * describing the cases where GSO segment length exceeds the
150          * egress MTU.
151          */
152         features = netif_skb_features(skb);
153         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154         if (IS_ERR_OR_NULL(segs)) {
155                 kfree_skb(skb);
156                 return -ENOMEM;
157         }
158
159         consume_skb(skb);
160
161         skb_list_walk_safe(segs, segs, nskb) {
162                 int err;
163
164                 skb_mark_not_on_list(segs);
165                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
166                 if (err && ret == 0)
167                         ret = err;
168         }
169
170         return ret;
171 }
172
173 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
174 {
175         unsigned int mtu;
176
177 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
178         /* Policy lookup after SNAT yielded a new policy */
179         if (skb_dst(skb)->xfrm) {
180                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
181                 return dst_output(net, sk, skb);
182         }
183 #endif
184
185         mtu = ip6_skb_dst_mtu(skb);
186         if (skb_is_gso(skb) &&
187             !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
188             !skb_gso_validate_network_len(skb, mtu))
189                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
190
191         if ((skb->len > mtu && !skb_is_gso(skb)) ||
192             dst_allfrag(skb_dst(skb)) ||
193             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
194                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
195         else
196                 return ip6_finish_output2(net, sk, skb);
197 }
198
199 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
200 {
201         int ret;
202
203         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
204         switch (ret) {
205         case NET_XMIT_SUCCESS:
206         case NET_XMIT_CN:
207                 return __ip6_finish_output(net, sk, skb) ? : ret;
208         default:
209                 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
210                 return ret;
211         }
212 }
213
214 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
215 {
216         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
217         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
218
219         skb->protocol = htons(ETH_P_IPV6);
220         skb->dev = dev;
221
222         if (unlikely(idev->cnf.disable_ipv6)) {
223                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
224                 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
225                 return 0;
226         }
227
228         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
229                             net, sk, skb, indev, dev,
230                             ip6_finish_output,
231                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
232 }
233 EXPORT_SYMBOL(ip6_output);
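
/*
 * A minimal sketch (illustrative, not a call site in this file) of how
 * ip6_output() is normally reached: it is installed as the dst_entry's
 * output handler, so senders simply do
 *
 *      err = dst_output(net, sk, skb);    // -> ip6_output() for IPv6 dsts
 *
 * rather than calling it directly.
 */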
234
235 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
236 {
237         if (!np->autoflowlabel_set)
238                 return ip6_default_np_autolabel(net);
239         else
240                 return np->autoflowlabel;
241 }
242
243 /*
244  * xmit an sk_buff (used by TCP, SCTP and DCCP)
245  * Note: the socket lock is not held for SYNACK packets, but the socket may
246  * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
247  * which use proper atomic operations or spinlocks.
248  */
249 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
250              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
251 {
252         struct net *net = sock_net(sk);
253         const struct ipv6_pinfo *np = inet6_sk(sk);
254         struct in6_addr *first_hop = &fl6->daddr;
255         struct dst_entry *dst = skb_dst(skb);
256         struct net_device *dev = dst->dev;
257         struct inet6_dev *idev = ip6_dst_idev(dst);
258         struct hop_jumbo_hdr *hop_jumbo;
259         int hoplen = sizeof(*hop_jumbo);
260         unsigned int head_room;
261         struct ipv6hdr *hdr;
262         u8  proto = fl6->flowi6_proto;
263         int seg_len = skb->len;
264         int hlimit = -1;
265         u32 mtu;
266
267         head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
268         if (opt)
269                 head_room += opt->opt_nflen + opt->opt_flen;
270
271         if (unlikely(head_room > skb_headroom(skb))) {
272                 skb = skb_expand_head(skb, head_room);
273                 if (!skb) {
274                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
275                         return -ENOBUFS;
276                 }
277         }
278
279         if (opt) {
280                 seg_len += opt->opt_nflen + opt->opt_flen;
281
282                 if (opt->opt_flen)
283                         ipv6_push_frag_opts(skb, opt, &proto);
284
285                 if (opt->opt_nflen)
286                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
287                                              &fl6->saddr);
288         }
289
290         if (unlikely(seg_len > IPV6_MAXPLEN)) {
291                 hop_jumbo = skb_push(skb, hoplen);
292
293                 hop_jumbo->nexthdr = proto;
294                 hop_jumbo->hdrlen = 0;
295                 hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
296                 hop_jumbo->tlv_len = 4;
297                 hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
298
299                 proto = IPPROTO_HOPOPTS;
300                 seg_len = 0;
301                 IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
302         }
303
304         skb_push(skb, sizeof(struct ipv6hdr));
305         skb_reset_network_header(skb);
306         hdr = ipv6_hdr(skb);
307
308         /*
309          *      Fill in the IPv6 header
310          */
311         if (np)
312                 hlimit = READ_ONCE(np->hop_limit);
313         if (hlimit < 0)
314                 hlimit = ip6_dst_hoplimit(dst);
315
316         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
317                                 ip6_autoflowlabel(net, np), fl6));
318
319         hdr->payload_len = htons(seg_len);
320         hdr->nexthdr = proto;
321         hdr->hop_limit = hlimit;
322
323         hdr->saddr = fl6->saddr;
324         hdr->daddr = *first_hop;
325
326         skb->protocol = htons(ETH_P_IPV6);
327         skb->priority = priority;
328         skb->mark = mark;
329
330         mtu = dst_mtu(dst);
331         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
332                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
333
334                 /* if egress device is enslaved to an L3 master device, pass the
335                  * skb to its handler for processing
336                  */
337                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
338                 if (unlikely(!skb))
339                         return 0;
340
341                 /* hooks should never assume the socket lock is held;
342                  * we promote our socket to non-const.
343                  */
344                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
345                                net, (struct sock *)sk, skb, NULL, dev,
346                                dst_output);
347         }
348
349         skb->dev = dev;
350         /* ipv6_local_error() does not require the socket lock,
351          * so we promote our socket to non-const.
352          */
353         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
354
355         IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
356         kfree_skb(skb);
357         return -EMSGSIZE;
358 }
359 EXPORT_SYMBOL(ip6_xmit);
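
/* A minimal caller sketch, under assumptions: a connection-oriented socket
 * that already holds a routed skb and a populated flow (TCP's v6 transmit
 * paths look roughly like this). The zero tclass and NULL options are
 * illustrative choices, not requirements.
 */
static inline int ip6_xmit_sketch(struct sock *sk, struct sk_buff *skb,
                                  struct flowi6 *fl6)
{
        return ip6_xmit(sk, skb, fl6, READ_ONCE(sk->sk_mark),
                        NULL /* opt */, 0 /* tclass */,
                        READ_ONCE(sk->sk_priority));
}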
360
361 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
362 {
363         struct ip6_ra_chain *ra;
364         struct sock *last = NULL;
365
366         read_lock(&ip6_ra_lock);
367         for (ra = ip6_ra_chain; ra; ra = ra->next) {
368                 struct sock *sk = ra->sk;
369                 if (sk && ra->sel == sel &&
370                     (!sk->sk_bound_dev_if ||
371                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
372                         struct ipv6_pinfo *np = inet6_sk(sk);
373
374                         if (np && np->rtalert_isolate &&
375                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
376                                 continue;
377                         }
378                         if (last) {
379                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
380                                 if (skb2)
381                                         rawv6_rcv(last, skb2);
382                         }
383                         last = sk;
384                 }
385         }
386
387         if (last) {
388                 rawv6_rcv(last, skb);
389                 read_unlock(&ip6_ra_lock);
390                 return 1;
391         }
392         read_unlock(&ip6_ra_lock);
393         return 0;
394 }
395
396 static int ip6_forward_proxy_check(struct sk_buff *skb)
397 {
398         struct ipv6hdr *hdr = ipv6_hdr(skb);
399         u8 nexthdr = hdr->nexthdr;
400         __be16 frag_off;
401         int offset;
402
403         if (ipv6_ext_hdr(nexthdr)) {
404                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
405                 if (offset < 0)
406                         return 0;
407         } else
408                 offset = sizeof(struct ipv6hdr);
409
410         if (nexthdr == IPPROTO_ICMPV6) {
411                 struct icmp6hdr *icmp6;
412
413                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
414                                          offset + 1 - skb->data)))
415                         return 0;
416
417                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
418
419                 switch (icmp6->icmp6_type) {
420                 case NDISC_ROUTER_SOLICITATION:
421                 case NDISC_ROUTER_ADVERTISEMENT:
422                 case NDISC_NEIGHBOUR_SOLICITATION:
423                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
424                 case NDISC_REDIRECT:
425                         /* Unicast neighbour discovery messages destined
426                          * to the proxied address must be passed to the
427                          * input function.
428                          */
429                         return 1;
430                 default:
431                         break;
432                 }
433         }
434
435         /*
436          * The proxying router can't forward traffic sent to a link-local
437          * address, so signal the sender and discard the packet. This
438          * behavior is clarified by the MIPv6 specification.
439          */
440         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
441                 dst_link_failure(skb);
442                 return -1;
443         }
444
445         return 0;
446 }
447
448 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
449                                      struct sk_buff *skb)
450 {
451         struct dst_entry *dst = skb_dst(skb);
452
453         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
454
455 #ifdef CONFIG_NET_SWITCHDEV
456         if (skb->offload_l3_fwd_mark) {
457                 consume_skb(skb);
458                 return 0;
459         }
460 #endif
461
462         skb_clear_tstamp(skb);
463         return dst_output(net, sk, skb);
464 }
465
466 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
467 {
468         if (skb->len <= mtu)
469                 return false;
470
471         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
472         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
473                 return true;
474
475         if (skb->ignore_df)
476                 return false;
477
478         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
479                 return false;
480
481         return true;
482 }
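
/*
 * Worked example: with mtu == IPV6_MIN_MTU (1280), a 1500 byte non-GSO
 * skb with ignore_df clear is too big and will draw ICMPV6_PKT_TOOBIG
 * from ip6_forward(). If conntrack defrag set ignore_df, the verdict
 * instead depends on frag_max_size: fragments no larger than 1280 on
 * the wire pass, anything larger is still rejected.
 */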
483
484 int ip6_forward(struct sk_buff *skb)
485 {
486         struct dst_entry *dst = skb_dst(skb);
487         struct ipv6hdr *hdr = ipv6_hdr(skb);
488         struct inet6_skb_parm *opt = IP6CB(skb);
489         struct net *net = dev_net(dst->dev);
490         struct inet6_dev *idev;
491         SKB_DR(reason);
492         u32 mtu;
493
494         idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
495         if (net->ipv6.devconf_all->forwarding == 0)
496                 goto error;
497
498         if (skb->pkt_type != PACKET_HOST)
499                 goto drop;
500
501         if (unlikely(skb->sk))
502                 goto drop;
503
504         if (skb_warn_if_lro(skb))
505                 goto drop;
506
507         if (!net->ipv6.devconf_all->disable_policy &&
508             (!idev || !idev->cnf.disable_policy) &&
509             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
510                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
511                 goto drop;
512         }
513
514         skb_forward_csum(skb);
515
516         /*
517          *      We DO NOT do any processing on
518          *      RA packets, pushing them to user level AS IS
519          *      without any WARRANTY that the application will be able
520          *      to interpret them. The reason is that we
521          *      cannot do anything clever here.
522          *
523          *      We are not an end node, so if the packet contains
524          *      AH/ESP we cannot do anything with it.
525          *      Defragmentation would also be a mistake; RA packets
526          *      cannot be fragmented, because there is no guarantee
527          *      that different fragments will go along one path. --ANK
528          */
529         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
530                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
531                         return 0;
532         }
533
534         /*
535          *      check and decrement ttl
536          */
537         if (hdr->hop_limit <= 1) {
538                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
539                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
540
541                 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
542                 return -ETIMEDOUT;
543         }
544
545         /* XXX: idev->cnf.proxy_ndp? */
546         if (net->ipv6.devconf_all->proxy_ndp &&
547             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
548                 int proxied = ip6_forward_proxy_check(skb);
549                 if (proxied > 0) {
550                         /* It's tempting to decrease the hop limit
551                          * here by 1, as we do at the end of the
552                          * function too.
553                          *
554                          * But that would be incorrect, as proxying is
555                          * not forwarding.  The ip6_input function
556                          * will handle this packet locally, and it
557                          * depends on the hop limit being unchanged.
558                          *
559                          * One example is the NDP hop limit, which
560                          * always has to stay at 255; another would be
561                          * the similar checks on RA packets, where the
562                          * user can even change the desired limit.
563                          */
564                         return ip6_input(skb);
565                 } else if (proxied < 0) {
566                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
567                         goto drop;
568                 }
569         }
570
571         if (!xfrm6_route_forward(skb)) {
572                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
573                 SKB_DR_SET(reason, XFRM_POLICY);
574                 goto drop;
575         }
576         dst = skb_dst(skb);
577
578         /* IPv6 specs say nothing about it, but it is clear that we cannot
579            send redirects to source routed frames.
580            We don't send redirects to frames decapsulated from IPsec.
581          */
582         if (IP6CB(skb)->iif == dst->dev->ifindex &&
583             opt->srcrt == 0 && !skb_sec_path(skb)) {
584                 struct in6_addr *target = NULL;
585                 struct inet_peer *peer;
586                 struct rt6_info *rt;
587
588                 /*
589                  *      incoming and outgoing devices are the same,
590                  *      so send a redirect.
591                  */
592
593                 rt = (struct rt6_info *) dst;
594                 if (rt->rt6i_flags & RTF_GATEWAY)
595                         target = &rt->rt6i_gateway;
596                 else
597                         target = &hdr->daddr;
598
599                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
600
601                 /* Limit redirects both by destination (here)
602                    and by source (inside ndisc_send_redirect)
603                  */
604                 if (inet_peer_xrlim_allow(peer, 1*HZ))
605                         ndisc_send_redirect(skb, target);
606                 if (peer)
607                         inet_putpeer(peer);
608         } else {
609                 int addrtype = ipv6_addr_type(&hdr->saddr);
610
611                 /* This check is security critical. */
612                 if (addrtype == IPV6_ADDR_ANY ||
613                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
614                         goto error;
615                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
616                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
617                                     ICMPV6_NOT_NEIGHBOUR, 0);
618                         goto error;
619                 }
620         }
621
622         mtu = ip6_dst_mtu_maybe_forward(dst, true);
623         if (mtu < IPV6_MIN_MTU)
624                 mtu = IPV6_MIN_MTU;
625
626         if (ip6_pkt_too_big(skb, mtu)) {
627                 /* Again, force the OUTPUT device to be used for the source address */
628                 skb->dev = dst->dev;
629                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
630                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
631                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
632                                 IPSTATS_MIB_FRAGFAILS);
633                 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
634                 return -EMSGSIZE;
635         }
636
637         if (skb_cow(skb, dst->dev->hard_header_len)) {
638                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
639                                 IPSTATS_MIB_OUTDISCARDS);
640                 goto drop;
641         }
642
643         hdr = ipv6_hdr(skb);
644
645         /* Mangling the hop count is delayed to the point after the skb COW */
646
647         hdr->hop_limit--;
648
649         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
650                        net, NULL, skb, skb->dev, dst->dev,
651                        ip6_forward_finish);
652
653 error:
654         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
655         SKB_DR_SET(reason, IP_INADDRERRORS);
656 drop:
657         kfree_skb_reason(skb, reason);
658         return -EINVAL;
659 }
660
661 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
662 {
663         to->pkt_type = from->pkt_type;
664         to->priority = from->priority;
665         to->protocol = from->protocol;
666         skb_dst_drop(to);
667         skb_dst_set(to, dst_clone(skb_dst(from)));
668         to->dev = from->dev;
669         to->mark = from->mark;
670
671         skb_copy_hash(to, from);
672
673 #ifdef CONFIG_NET_SCHED
674         to->tc_index = from->tc_index;
675 #endif
676         nf_copy(to, from);
677         skb_ext_copy(to, from);
678         skb_copy_secmark(to, from);
679 }
680
681 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
682                       u8 nexthdr, __be32 frag_id,
683                       struct ip6_fraglist_iter *iter)
684 {
685         unsigned int first_len;
686         struct frag_hdr *fh;
687
688         /* BUILD HEADER */
689         *prevhdr = NEXTHDR_FRAGMENT;
690         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
691         if (!iter->tmp_hdr)
692                 return -ENOMEM;
693
694         iter->frag = skb_shinfo(skb)->frag_list;
695         skb_frag_list_init(skb);
696
697         iter->offset = 0;
698         iter->hlen = hlen;
699         iter->frag_id = frag_id;
700         iter->nexthdr = nexthdr;
701
702         __skb_pull(skb, hlen);
703         fh = __skb_push(skb, sizeof(struct frag_hdr));
704         __skb_push(skb, hlen);
705         skb_reset_network_header(skb);
706         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
707
708         fh->nexthdr = nexthdr;
709         fh->reserved = 0;
710         fh->frag_off = htons(IP6_MF);
711         fh->identification = frag_id;
712
713         first_len = skb_pagelen(skb);
714         skb->data_len = first_len - skb_headlen(skb);
715         skb->len = first_len;
716         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
717
718         return 0;
719 }
720 EXPORT_SYMBOL(ip6_fraglist_init);
721
722 void ip6_fraglist_prepare(struct sk_buff *skb,
723                           struct ip6_fraglist_iter *iter)
724 {
725         struct sk_buff *frag = iter->frag;
726         unsigned int hlen = iter->hlen;
727         struct frag_hdr *fh;
728
729         frag->ip_summed = CHECKSUM_NONE;
730         skb_reset_transport_header(frag);
731         fh = __skb_push(frag, sizeof(struct frag_hdr));
732         __skb_push(frag, hlen);
733         skb_reset_network_header(frag);
734         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
735         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
736         fh->nexthdr = iter->nexthdr;
737         fh->reserved = 0;
738         fh->frag_off = htons(iter->offset);
739         if (frag->next)
740                 fh->frag_off |= htons(IP6_MF);
741         fh->identification = iter->frag_id;
742         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
743         ip6_copy_metadata(frag, skb);
744 }
745 EXPORT_SYMBOL(ip6_fraglist_prepare);
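
/*
 * A minimal sketch of the fast-path pattern these two helpers support
 * (mirroring the skb_has_frag_list() branch of ip6_fragment() below):
 *
 *      err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, &iter);
 *      for (;;) {
 *              if (iter.frag)
 *                      ip6_fraglist_prepare(skb, &iter);
 *              err = output(net, sk, skb);
 *              if (err || !iter.frag)
 *                      break;
 *              skb = ip6_fraglist_next(&iter);
 *      }
 *      kfree(iter.tmp_hdr);
 */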
746
747 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
748                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
749                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
750 {
751         state->prevhdr = prevhdr;
752         state->nexthdr = nexthdr;
753         state->frag_id = frag_id;
754
755         state->hlen = hlen;
756         state->mtu = mtu;
757
758         state->left = skb->len - hlen;  /* Space per frame */
759         state->ptr = hlen;              /* Where to start from */
760
761         state->hroom = hdr_room;
762         state->troom = needed_tailroom;
763
764         state->offset = 0;
765 }
766 EXPORT_SYMBOL(ip6_frag_init);
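
/*
 * Worked example of the sizing these helpers implement: with an egress
 * MTU of 1500 and hlen == 40 (a bare IPv6 header), ip6_fragment() passes
 * mtu = 1500 - 40 - 8 = 1452 here, so every non-final fragment carries
 * 1452 & ~7 = 1448 bytes of payload after the fragment header, and the
 * final fragment carries whatever remains.
 */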
767
768 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
769 {
770         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
771         struct sk_buff *frag;
772         struct frag_hdr *fh;
773         unsigned int len;
774
775         len = state->left;
776         /* IF: it doesn't fit, use 'mtu' - the data space left */
777         if (len > state->mtu)
778                 len = state->mtu;
779         /* IF: we are not sending up to and including the packet end
780            then align the next start on an eight byte boundary */
781         if (len < state->left)
782                 len &= ~7;
783
784         /* Allocate buffer */
785         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
786                          state->hroom + state->troom, GFP_ATOMIC);
787         if (!frag)
788                 return ERR_PTR(-ENOMEM);
789
790         /*
791          *      Set up data on packet
792          */
793
794         ip6_copy_metadata(frag, skb);
795         skb_reserve(frag, state->hroom);
796         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
797         skb_reset_network_header(frag);
798         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
799         frag->transport_header = (frag->network_header + state->hlen +
800                                   sizeof(struct frag_hdr));
801
802         /*
803          *      Charge the memory for the fragment to any owner
804          *      it might possess
805          */
806         if (skb->sk)
807                 skb_set_owner_w(frag, skb->sk);
808
809         /*
810          *      Copy the packet header into the new buffer.
811          */
812         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
813
814         fragnexthdr_offset = skb_network_header(frag);
815         fragnexthdr_offset += prevhdr - skb_network_header(skb);
816         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
817
818         /*
819          *      Build fragment header.
820          */
821         fh->nexthdr = state->nexthdr;
822         fh->reserved = 0;
823         fh->identification = state->frag_id;
824
825         /*
826          *      Copy a block of the IP datagram.
827          */
828         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
829                              len));
830         state->left -= len;
831
832         fh->frag_off = htons(state->offset);
833         if (state->left > 0)
834                 fh->frag_off |= htons(IP6_MF);
835         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
836
837         state->ptr += len;
838         state->offset += len;
839
840         return frag;
841 }
842 EXPORT_SYMBOL(ip6_frag_next);
843
844 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
845                  int (*output)(struct net *, struct sock *, struct sk_buff *))
846 {
847         struct sk_buff *frag;
848         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
849         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
850                                 inet6_sk(skb->sk) : NULL;
851         bool mono_delivery_time = skb->mono_delivery_time;
852         struct ip6_frag_state state;
853         unsigned int mtu, hlen, nexthdr_offset;
854         ktime_t tstamp = skb->tstamp;
855         int hroom, err = 0;
856         __be32 frag_id;
857         u8 *prevhdr, nexthdr = 0;
858
859         err = ip6_find_1stfragopt(skb, &prevhdr);
860         if (err < 0)
861                 goto fail;
862         hlen = err;
863         nexthdr = *prevhdr;
864         nexthdr_offset = prevhdr - skb_network_header(skb);
865
866         mtu = ip6_skb_dst_mtu(skb);
867
868         /* We must not fragment if the socket is set to force MTU discovery
869          * or if the skb is not generated by a local socket.
870          */
871         if (unlikely(!skb->ignore_df && skb->len > mtu))
872                 goto fail_toobig;
873
874         if (IP6CB(skb)->frag_max_size) {
875                 if (IP6CB(skb)->frag_max_size > mtu)
876                         goto fail_toobig;
877
878                 /* don't send fragments larger than what we received */
879                 mtu = IP6CB(skb)->frag_max_size;
880                 if (mtu < IPV6_MIN_MTU)
881                         mtu = IPV6_MIN_MTU;
882         }
883
884         if (np && np->frag_size < mtu) {
885                 if (np->frag_size)
886                         mtu = np->frag_size;
887         }
888         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
889                 goto fail_toobig;
890         mtu -= hlen + sizeof(struct frag_hdr);
891
892         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
893                                     &ipv6_hdr(skb)->saddr);
894
895         if (skb->ip_summed == CHECKSUM_PARTIAL &&
896             (err = skb_checksum_help(skb)))
897                 goto fail;
898
899         prevhdr = skb_network_header(skb) + nexthdr_offset;
900         hroom = LL_RESERVED_SPACE(rt->dst.dev);
901         if (skb_has_frag_list(skb)) {
902                 unsigned int first_len = skb_pagelen(skb);
903                 struct ip6_fraglist_iter iter;
904                 struct sk_buff *frag2;
905
906                 if (first_len - hlen > mtu ||
907                     ((first_len - hlen) & 7) ||
908                     skb_cloned(skb) ||
909                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
910                         goto slow_path;
911
912                 skb_walk_frags(skb, frag) {
913                         /* Correct geometry. */
914                         if (frag->len > mtu ||
915                             ((frag->len & 7) && frag->next) ||
916                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
917                                 goto slow_path_clean;
918
919                         /* Partially cloned skb? */
920                         if (skb_shared(frag))
921                                 goto slow_path_clean;
922
923                         BUG_ON(frag->sk);
924                         if (skb->sk) {
925                                 frag->sk = skb->sk;
926                                 frag->destructor = sock_wfree;
927                         }
928                         skb->truesize -= frag->truesize;
929                 }
930
931                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
932                                         &iter);
933                 if (err < 0)
934                         goto fail;
935
936                 /* We prevent @rt from being freed. */
937                 rcu_read_lock();
938
939                 for (;;) {
940                         /* Prepare the header of the next frame
941                          * before the previous one goes down. */
942                         if (iter.frag)
943                                 ip6_fraglist_prepare(skb, &iter);
944
945                         skb_set_delivery_time(skb, tstamp, mono_delivery_time);
946                         err = output(net, sk, skb);
947                         if (!err)
948                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
949                                               IPSTATS_MIB_FRAGCREATES);
950
951                         if (err || !iter.frag)
952                                 break;
953
954                         skb = ip6_fraglist_next(&iter);
955                 }
956
957                 kfree(iter.tmp_hdr);
958
959                 if (err == 0) {
960                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
961                                       IPSTATS_MIB_FRAGOKS);
962                         rcu_read_unlock();
963                         return 0;
964                 }
965
966                 kfree_skb_list(iter.frag);
967
968                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
969                               IPSTATS_MIB_FRAGFAILS);
970                 rcu_read_unlock();
971                 return err;
972
973 slow_path_clean:
974                 skb_walk_frags(skb, frag2) {
975                         if (frag2 == frag)
976                                 break;
977                         frag2->sk = NULL;
978                         frag2->destructor = NULL;
979                         skb->truesize += frag2->truesize;
980                 }
981         }
982
983 slow_path:
984         /*
985          *      Fragment the datagram.
986          */
987
988         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
989                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
990                       &state);
991
992         /*
993          *      Keep copying data until we run out.
994          */
995
996         while (state.left > 0) {
997                 frag = ip6_frag_next(skb, &state);
998                 if (IS_ERR(frag)) {
999                         err = PTR_ERR(frag);
1000                         goto fail;
1001                 }
1002
1003                 /*
1004                  *      Put this fragment into the sending queue.
1005                  */
1006                 skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1007                 err = output(net, sk, frag);
1008                 if (err)
1009                         goto fail;
1010
1011                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1012                               IPSTATS_MIB_FRAGCREATES);
1013         }
1014         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1015                       IPSTATS_MIB_FRAGOKS);
1016         consume_skb(skb);
1017         return err;
1018
1019 fail_toobig:
1020         if (skb->sk && dst_allfrag(skb_dst(skb)))
1021                 sk_gso_disable(skb->sk);
1022
1023         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1024         err = -EMSGSIZE;
1025
1026 fail:
1027         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1028                       IPSTATS_MIB_FRAGFAILS);
1029         kfree_skb(skb);
1030         return err;
1031 }
1032
1033 static inline int ip6_rt_check(const struct rt6key *rt_key,
1034                                const struct in6_addr *fl_addr,
1035                                const struct in6_addr *addr_cache)
1036 {
1037         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1038                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1039 }
1040
1041 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1042                                           struct dst_entry *dst,
1043                                           const struct flowi6 *fl6)
1044 {
1045         struct ipv6_pinfo *np = inet6_sk(sk);
1046         struct rt6_info *rt;
1047
1048         if (!dst)
1049                 goto out;
1050
1051         if (dst->ops->family != AF_INET6) {
1052                 dst_release(dst);
1053                 return NULL;
1054         }
1055
1056         rt = (struct rt6_info *)dst;
1057         /* Yes, checking route validity in the not-connected
1058          * case is not very simple. Take into account
1059          * that we do not support routing by source, TOS,
1060          * or MSG_DONTROUTE             --ANK (980726)
1061          *
1062          * 1. ip6_rt_check(): If the route was a host route,
1063          *    check that the cached destination is current.
1064          *    If it is a network route, we still may
1065          *    check its validity using a saved pointer
1066          *    to the last used address: daddr_cache.
1067          *    We do not want to save the whole address now
1068          *    (because the main consumer of this service
1069          *    is TCP, which does not have this problem),
1070          *    so this last trick works only on connected
1071          *    sockets.
1072          * 2. oif also should be the same.
1073          */
1074         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1075 #ifdef CONFIG_IPV6_SUBTREES
1076             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1077 #endif
1078            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1079                 dst_release(dst);
1080                 dst = NULL;
1081         }
1082
1083 out:
1084         return dst;
1085 }
1086
1087 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1088                                struct dst_entry **dst, struct flowi6 *fl6)
1089 {
1090 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1091         struct neighbour *n;
1092         struct rt6_info *rt;
1093 #endif
1094         int err;
1095         int flags = 0;
1096
1097         /* The correct way to handle this would be to do
1098          * ip6_route_get_saddr, and then ip6_route_output; however,
1099          * the route-specific preferred source forces the
1100          * ip6_route_output call _before_ ip6_route_get_saddr.
1101          *
1102          * In source specific routing (no src=any default route),
1103          * ip6_route_output will fail given src=any saddr, though, so
1104          * that's why we try it again later.
1105          */
1106         if (ipv6_addr_any(&fl6->saddr)) {
1107                 struct fib6_info *from;
1108                 struct rt6_info *rt;
1109
1110                 *dst = ip6_route_output(net, sk, fl6);
1111                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1112
1113                 rcu_read_lock();
1114                 from = rt ? rcu_dereference(rt->from) : NULL;
1115                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1116                                           sk ? inet6_sk(sk)->srcprefs : 0,
1117                                           &fl6->saddr);
1118                 rcu_read_unlock();
1119
1120                 if (err)
1121                         goto out_err_release;
1122
1123                 /* If we had an erroneous initial result, pretend it
1124                  * never existed and let the SA-enabled version take
1125                  * over.
1126                  */
1127                 if ((*dst)->error) {
1128                         dst_release(*dst);
1129                         *dst = NULL;
1130                 }
1131
1132                 if (fl6->flowi6_oif)
1133                         flags |= RT6_LOOKUP_F_IFACE;
1134         }
1135
1136         if (!*dst)
1137                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1138
1139         err = (*dst)->error;
1140         if (err)
1141                 goto out_err_release;
1142
1143 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1144         /*
1145          * If the dst entry we've looked up here
1146          * has a neighbour entry that is in the INCOMPLETE
1147          * state and the src address from the flow is
1148          * marked as OPTIMISTIC, we release the found
1149          * dst entry and replace it with the
1150          * dst entry of the nexthop router.
1151          */
1152         rt = (struct rt6_info *) *dst;
1153         rcu_read_lock();
1154         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1155                                       rt6_nexthop(rt, &fl6->daddr));
1156         err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1157         rcu_read_unlock();
1158
1159         if (err) {
1160                 struct inet6_ifaddr *ifp;
1161                 struct flowi6 fl_gw6;
1162                 int redirect;
1163
1164                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1165                                       (*dst)->dev, 1);
1166
1167                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1168                 if (ifp)
1169                         in6_ifa_put(ifp);
1170
1171                 if (redirect) {
1172                         /*
1173                          * We need to get the dst entry for the
1174                          * default router instead
1175                          */
1176                         dst_release(*dst);
1177                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1178                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1179                         *dst = ip6_route_output(net, sk, &fl_gw6);
1180                         err = (*dst)->error;
1181                         if (err)
1182                                 goto out_err_release;
1183                 }
1184         }
1185 #endif
1186         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1187             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1188                 err = -EAFNOSUPPORT;
1189                 goto out_err_release;
1190         }
1191
1192         return 0;
1193
1194 out_err_release:
1195         dst_release(*dst);
1196         *dst = NULL;
1197
1198         if (err == -ENETUNREACH)
1199                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1200         return err;
1201 }
1202
1203 /**
1204  *      ip6_dst_lookup - perform route lookup on flow
1205  *      @net: Network namespace to perform lookup in
1206  *      @sk: socket which provides route info
1207  *      @dst: pointer to dst_entry * for result
1208  *      @fl6: flow to lookup
1209  *
1210  *      This function performs a route lookup on the given flow.
1211  *
1212  *      It returns zero on success, or a standard errno code on error.
1213  */
1214 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1215                    struct flowi6 *fl6)
1216 {
1217         *dst = NULL;
1218         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1219 }
1220 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1221
1222 /**
1223  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1224  *      @net: Network namespace to perform lookup in
1225  *      @sk: socket which provides route info
1226  *      @fl6: flow to lookup
1227  *      @final_dst: final destination address for ipsec lookup
1228  *
1229  *      This function performs a route lookup on the given flow.
1230  *
1231  *      It returns a valid dst pointer on success, or a pointer encoded
1232  *      error code.
1233  */
1234 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1235                                       const struct in6_addr *final_dst)
1236 {
1237         struct dst_entry *dst = NULL;
1238         int err;
1239
1240         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1241         if (err)
1242                 return ERR_PTR(err);
1243         if (final_dst)
1244                 fl6->daddr = *final_dst;
1245
1246         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1247 }
1248 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
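
/* A minimal caller sketch, with assumed values: resolve a route for a
 * datagram flow and rely on the pointer-encoded error convention noted
 * in the kernel-doc above. The UDP protocol choice is illustrative.
 */
static inline struct dst_entry *
ip6_dst_lookup_flow_sketch(struct net *net, struct sock *sk,
                           const struct in6_addr *daddr)
{
        struct flowi6 fl6;

        memset(&fl6, 0, sizeof(fl6));
        fl6.flowi6_proto = IPPROTO_UDP;
        fl6.daddr = *daddr;
        /* fl6.saddr left as any: ip6_dst_lookup_tail() selects a source */

        return ip6_dst_lookup_flow(net, sk, &fl6, NULL);
}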
1249
1250 /**
1251  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1252  *      @sk: socket which provides the dst cache and route info
1253  *      @fl6: flow to lookup
1254  *      @final_dst: final destination address for ipsec lookup
1255  *      @connected: whether @sk is connected or not
1256  *
1257  *      This function performs a route lookup on the given flow with the
1258  *      possibility of using the cached route in the socket if it is valid.
1259  *      It will take the socket dst lock when operating on the dst cache.
1260  *      As a result, this function can only be used in process context.
1261  *
1262  *      In addition, for a connected socket, cache the dst in the socket
1263  *      if the current cache is not valid.
1264  *
1265  *      It returns a valid dst pointer on success, or a pointer encoded
1266  *      error code.
1267  */
1268 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1269                                          const struct in6_addr *final_dst,
1270                                          bool connected)
1271 {
1272         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1273
1274         dst = ip6_sk_dst_check(sk, dst, fl6);
1275         if (dst)
1276                 return dst;
1277
1278         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1279         if (connected && !IS_ERR(dst))
1280                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1281
1282         return dst;
1283 }
1284 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
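
/*
 * For example, a datagram sender can combine the socket's cached route
 * with a fresh lookup (a minimal sketch of the udpv6_sendmsg()-style
 * call; fl6, final_p and connected come from the caller's context):
 *
 *      dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
 *      if (IS_ERR(dst)) {
 *              err = PTR_ERR(dst);
 *              goto out;
 *      }
 */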
1285
1286 /**
1287  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1288  *      @skb: Packet for which lookup is done
1289  *      @dev: Tunnel device
1290  *      @net: Network namespace of tunnel device
1291  *      @sock: Socket which provides route info
1292  *      @saddr: Memory to store the src ip address
1293  *      @info: Tunnel information
1294  *      @protocol: IP protocol
1295  *      @use_cache: Flag to enable cache usage
1296  *
1297  *      This function performs a route lookup on a tunnel. On success it
1298  *      returns a valid dst pointer and stores the tunnel's source address
1299  *      in @saddr; otherwise it returns a pointer-encoded error code.
1300  */
1301
1302 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1303                                         struct net_device *dev,
1304                                         struct net *net,
1305                                         struct socket *sock,
1306                                         struct in6_addr *saddr,
1307                                         const struct ip_tunnel_info *info,
1308                                         u8 protocol,
1309                                         bool use_cache)
1310 {
1311         struct dst_entry *dst = NULL;
1312 #ifdef CONFIG_DST_CACHE
1313         struct dst_cache *dst_cache;
1314 #endif
1315         struct flowi6 fl6;
1316         __u8 prio;
1317
1318 #ifdef CONFIG_DST_CACHE
1319         dst_cache = (struct dst_cache *)&info->dst_cache;
1320         if (use_cache) {
1321                 dst = dst_cache_get_ip6(dst_cache, saddr);
1322                 if (dst)
1323                         return dst;
1324         }
1325 #endif
1326         memset(&fl6, 0, sizeof(fl6));
1327         fl6.flowi6_mark = skb->mark;
1328         fl6.flowi6_proto = protocol;
1329         fl6.daddr = info->key.u.ipv6.dst;
1330         fl6.saddr = info->key.u.ipv6.src;
1331         prio = info->key.tos;
1332         fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1333
1334         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1335                                               NULL);
1336         if (IS_ERR(dst)) {
1337                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1338                 return ERR_PTR(-ENETUNREACH);
1339         }
1340         if (dst->dev == dev) { /* is this necessary? */
1341                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1342                 dst_release(dst);
1343                 return ERR_PTR(-ELOOP);
1344         }
1345 #ifdef CONFIG_DST_CACHE
1346         if (use_cache)
1347                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1348 #endif
1349         *saddr = fl6.saddr;
1350         return dst;
1351 }
1352 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
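
/*
 * A minimal caller sketch, assuming a collect-metadata UDP tunnel transmit
 * path (the vxlan/geneve drivers follow this pattern; dev, sock, info and
 * use_cache come from the caller):
 *
 *      dst = ip6_dst_lookup_tunnel(skb, dev, net, sock, &saddr, info,
 *                                  IPPROTO_UDP, use_cache);
 *      if (IS_ERR(dst))
 *              goto tx_error;
 */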
1353
1354 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1355                                                gfp_t gfp)
1356 {
1357         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1358 }
1359
1360 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1361                                                 gfp_t gfp)
1362 {
1363         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1364 }
1365
1366 static void ip6_append_data_mtu(unsigned int *mtu,
1367                                 int *maxfraglen,
1368                                 unsigned int fragheaderlen,
1369                                 struct sk_buff *skb,
1370                                 struct rt6_info *rt,
1371                                 unsigned int orig_mtu)
1372 {
1373         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1374                 if (!skb) {
1375                         /* first fragment, reserve header_len */
1376                         *mtu = orig_mtu - rt->dst.header_len;
1377
1378                 } else {
1379                         /*
1380                          * this fragment is not the first; the header
1381                          * space is regarded as data space.
1382                          */
1383                         *mtu = orig_mtu;
1384                 }
1385                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1386                               + fragheaderlen - sizeof(struct frag_hdr);
1387         }
1388 }
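
/*
 * Worked example: with orig_mtu == 1500 and fragheaderlen == 40 (a bare
 * IPv6 header), the non-first-fragment case above yields
 * maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488, i.e. 1448 bytes of
 * payload per fragment; on the wire each fragment is then
 * 1448 + 40 + 8 = 1496 bytes, within the MTU.
 */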
1389
1390 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1391                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1392                           struct rt6_info *rt)
1393 {
1394         struct ipv6_pinfo *np = inet6_sk(sk);
1395         unsigned int mtu;
1396         struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1397
1398         /* callers pass dst together with a reference, set it first so
1399          * ip6_cork_release() can put it down even in case of an error.
1400          */
1401         cork->base.dst = &rt->dst;
1402
1403         /*
1404          * setup for corking
1405          */
1406         if (opt) {
1407                 if (WARN_ON(v6_cork->opt))
1408                         return -EINVAL;
1409
1410                 nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1411                 if (unlikely(!nopt))
1412                         return -ENOBUFS;
1413
1414                 nopt->tot_len = sizeof(*opt);
1415                 nopt->opt_flen = opt->opt_flen;
1416                 nopt->opt_nflen = opt->opt_nflen;
1417
1418                 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1419                 if (opt->dst0opt && !nopt->dst0opt)
1420                         return -ENOBUFS;
1421
1422                 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1423                 if (opt->dst1opt && !nopt->dst1opt)
1424                         return -ENOBUFS;
1425
1426                 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1427                 if (opt->hopopt && !nopt->hopopt)
1428                         return -ENOBUFS;
1429
1430                 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1431                 if (opt->srcrt && !nopt->srcrt)
1432                         return -ENOBUFS;
1433
1434                 /* need source address above - miyazawa */
1435         }
1436         v6_cork->hop_limit = ipc6->hlimit;
1437         v6_cork->tclass = ipc6->tclass;
1438         if (rt->dst.flags & DST_XFRM_TUNNEL)
1439                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1440                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1441         else
1442                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1443                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1444         if (np->frag_size < mtu) {
1445                 if (np->frag_size)
1446                         mtu = np->frag_size;
1447         }
1448         cork->base.fragsize = mtu;
1449         cork->base.gso_size = ipc6->gso_size;
1450         cork->base.tx_flags = 0;
1451         cork->base.mark = ipc6->sockc.mark;
1452         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1453
1454         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1455                 cork->base.flags |= IPCORK_ALLFRAG;
1456         cork->base.length = 0;
1457
1458         cork->base.transmit_time = ipc6->sockc.transmit_time;
1459
1460         return 0;
1461 }
1462
1463 static int __ip6_append_data(struct sock *sk,
1464                              struct sk_buff_head *queue,
1465                              struct inet_cork_full *cork_full,
1466                              struct inet6_cork *v6_cork,
1467                              struct page_frag *pfrag,
1468                              int getfrag(void *from, char *to, int offset,
1469                                          int len, int odd, struct sk_buff *skb),
1470                              void *from, size_t length, int transhdrlen,
1471                              unsigned int flags, struct ipcm6_cookie *ipc6)
1472 {
1473         struct sk_buff *skb, *skb_prev = NULL;
1474         struct inet_cork *cork = &cork_full->base;
1475         struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1476         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1477         struct ubuf_info *uarg = NULL;
1478         int exthdrlen = 0;
1479         int dst_exthdrlen = 0;
1480         int hh_len;
1481         int copy;
1482         int err;
1483         int offset = 0;
1484         bool zc = false;
1485         u32 tskey = 0;
1486         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1487         struct ipv6_txoptions *opt = v6_cork->opt;
1488         int csummode = CHECKSUM_NONE;
1489         unsigned int maxnonfragsize, headersize;
1490         unsigned int wmem_alloc_delta = 0;
1491         bool paged, extra_uref = false;
1492
1493         skb = skb_peek_tail(queue);
1494         if (!skb) {
1495                 exthdrlen = opt ? opt->opt_flen : 0;
1496                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1497         }
1498
1499         paged = !!cork->gso_size;
1500         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1501         orig_mtu = mtu;
1502
1503         if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1504             READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
1505                 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1506
1507         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1508
1509         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1510                         (opt ? opt->opt_nflen : 0);
1511
1512         headersize = sizeof(struct ipv6hdr) +
1513                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1514                      (dst_allfrag(&rt->dst) ?
1515                       sizeof(struct frag_hdr) : 0) +
1516                      rt->rt6i_nfheader_len;
1517
1518         if (mtu <= fragheaderlen ||
1519             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1520                 goto emsgsize;
1521
1522         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1523                      sizeof(struct frag_hdr);
1524
1525         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1526          * in the first fragment
1527          */
1528         if (headersize + transhdrlen > mtu)
1529                 goto emsgsize;
1530
1531         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1532             (sk->sk_protocol == IPPROTO_UDP ||
1533              sk->sk_protocol == IPPROTO_ICMPV6 ||
1534              sk->sk_protocol == IPPROTO_RAW)) {
1535                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1536                                 sizeof(struct ipv6hdr));
1537                 goto emsgsize;
1538         }
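        /*
         * Illustration (not code from this file): a UDP socket with
         * IPV6_DONTFRAG set that sends a datagram too big for one packet
         * lands here; ipv6_local_rxpmtu() queues the path MTU so a caller
         * using IPV6_RECVPATHMTU can read it, and the send itself fails
         * with EMSGSIZE below.
         */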
1539
1540         if (ip6_sk_ignore_df(sk))
1541                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1542         else
1543                 maxnonfragsize = mtu;
1544
1545         if (cork->length + length > maxnonfragsize - headersize) {
1546 emsgsize:
1547                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1548                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1549                 return -EMSGSIZE;
1550         }
1551
1552         /* Use CHECKSUM_PARTIAL only when there are no extension headers
1553          * and we are not going to fragment.
1554          */
1555         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1556             headersize == sizeof(struct ipv6hdr) &&
1557             length <= mtu - headersize &&
1558             (!(flags & MSG_MORE) || cork->gso_size) &&
1559             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1560                 csummode = CHECKSUM_PARTIAL;
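        /*
         * In practice: a single non-fragmented UDP datagram (or a UDP GSO
         * send, where cork->gso_size overrides the MSG_MORE test) on a
         * device with IPv6 checksum offload starts out CHECKSUM_PARTIAL;
         * anything else is checksummed in software as data is copied in.
         */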
1561
1562         if ((flags & MSG_ZEROCOPY) && length) {
1563                 struct msghdr *msg = from;
1564
1565                 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1566                         if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1567                                 return -EINVAL;
1568
1569                         /* Leave uarg NULL if we can't do zerocopy;
1570                          * callers should be able to handle it.
1571                          */
1572                         if ((rt->dst.dev->features & NETIF_F_SG) &&
1573                             csummode == CHECKSUM_PARTIAL) {
1574                                 paged = true;
1575                                 zc = true;
1576                                 uarg = msg->msg_ubuf;
1577                         }
1578                 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1579                         uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1580                         if (!uarg)
1581                                 return -ENOBUFS;
1582                         extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1583                         if (rt->dst.dev->features & NETIF_F_SG &&
1584                             csummode == CHECKSUM_PARTIAL) {
1585                                 paged = true;
1586                                 zc = true;
1587                         } else {
1588                                 uarg_to_msgzc(uarg)->zerocopy = 0;
1589                                 skb_zcopy_set(skb, uarg, &extra_uref);
1590                         }
1591                 }
1592         } else if ((flags & MSG_SPLICE_PAGES) && length) {
1593                 if (inet_test_bit(HDRINCL, sk))
1594                         return -EPERM;
1595                 if (rt->dst.dev->features & NETIF_F_SG &&
1596                     getfrag == ip_generic_getfrag)
1597                         /* We need an empty buffer to attach the spliced pages to */
1598                         paged = true;
1599                 else
1600                         flags &= ~MSG_SPLICE_PAGES;
1601         }
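        /*
         * A hedged sketch of how the MSG_ZEROCOPY path above is reached
         * from userspace (not code from this file):
         *
         *     int one = 1;
         *     setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one));
         *     send(fd, buf, len, MSG_ZEROCOPY);
         *
         * with completion notifications read from the socket error queue
         * via recvmsg(fd, &msg, MSG_ERRQUEUE).
         */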
1602
1603         /*
1604          * Let's try using as much space as possible.
1605          * Use MTU if total length of the message fits into the MTU.
1606          * Otherwise, we need to reserve fragment header and
1607          * fragment alignment (= 8-15 octets, in total).
1608          *
1609          * Note that we may need to "move" the data from the tail
1610          * of the buffer to the new fragment when we split
1611          * the message.
1612          *
1613          * FIXME: It may be fragmented into multiple chunks
1614          *        at once if non-fragmentable extension headers
1615          *        are too large.
1616          * --yoshfuji
1617          */
1618
1619         cork->length += length;
1620         if (!skb)
1621                 goto alloc_new_skb;
1622
1623         while (length > 0) {
1624                 /* Check if the remaining data fits into the current packet. */
1625                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1626                 if (copy < length)
1627                         copy = maxfraglen - skb->len;
1628
1629                 if (copy <= 0) {
1630                         char *data;
1631                         unsigned int datalen;
1632                         unsigned int fraglen;
1633                         unsigned int fraggap;
1634                         unsigned int alloclen, alloc_extra;
1635                         unsigned int pagedlen;
1636 alloc_new_skb:
1637                         /* There's no room in the current skb */
1638                         if (skb)
1639                                 fraggap = skb->len - maxfraglen;
1640                         else
1641                                 fraggap = 0;
1642                         /* update mtu and maxfraglen if necessary */
1643                         if (!skb || !skb_prev)
1644                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1645                                                     fragheaderlen, skb, rt,
1646                                                     orig_mtu);
1647
1648                         skb_prev = skb;
1649
1650                         /*
1651                          * If remaining data exceeds the mtu,
1652                          * we know we need more fragment(s).
1653                          */
1654                         datalen = length + fraggap;
1655
1656                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1657                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1658                         fraglen = datalen + fragheaderlen;
1659                         pagedlen = 0;
1660
1661                         alloc_extra = hh_len;
1662                         alloc_extra += dst_exthdrlen;
1663                         alloc_extra += rt->dst.trailer_len;
1664
1665                         /* We just reserve space for the fragment header.
1666                          * Note: this may be an overallocation if the message
1667                          * (without MSG_MORE) fits into the MTU.
1668                          */
1669                         alloc_extra += sizeof(struct frag_hdr);
1670
1671                         if ((flags & MSG_MORE) &&
1672                             !(rt->dst.dev->features&NETIF_F_SG))
1673                                 alloclen = mtu;
1674                         else if (!paged &&
1675                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1676                                   !(rt->dst.dev->features & NETIF_F_SG)))
1677                                 alloclen = fraglen;
1678                         else {
1679                                 alloclen = fragheaderlen + transhdrlen;
1680                                 pagedlen = datalen - transhdrlen;
1681                         }
1682                         alloclen += alloc_extra;
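                        /*
                         * Summary of the three strategies above: MSG_MORE on
                         * a non-SG device allocates a full mtu so later
                         * appends can fill the linear area; a small one-shot
                         * send allocates exactly fraglen; the paged (GSO or
                         * zerocopy) case allocates only the headers and
                         * leaves pagedlen bytes to arrive as page frags.
                         */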
1683
1684                         if (datalen != length + fraggap) {
1685                                 /*
1686                          * this is not the last fragment; the trailer
1687                          * space is regarded as data space.
1688                                  */
1689                                 datalen += rt->dst.trailer_len;
1690                         }
1691
1692                         fraglen = datalen + fragheaderlen;
1693
1694                         copy = datalen - transhdrlen - fraggap - pagedlen;
1695                         /* [!] NOTE: copy may be negative if pagedlen>0
1696                          * because then the equation reduces to -fraggap.
1697                          */
1698                         if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1699                                 err = -EINVAL;
1700                                 goto error;
1701                         }
1702                         if (transhdrlen) {
1703                                 skb = sock_alloc_send_skb(sk, alloclen,
1704                                                 (flags & MSG_DONTWAIT), &err);
1705                         } else {
1706                                 skb = NULL;
1707                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1708                                     2 * sk->sk_sndbuf)
1709                                         skb = alloc_skb(alloclen,
1710                                                         sk->sk_allocation);
1711                                 if (unlikely(!skb))
1712                                         err = -ENOBUFS;
1713                         }
1714                         if (!skb)
1715                                 goto error;
1716                         /*
1717                          *      Fill in the control structures
1718                          */
1719                         skb->protocol = htons(ETH_P_IPV6);
1720                         skb->ip_summed = csummode;
1721                         skb->csum = 0;
1722                         /* reserve for fragmentation and ipsec header */
1723                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1724                                     dst_exthdrlen);
1725
1726                         /*
1727                          *      Find where to start putting bytes
1728                          */
1729                         data = skb_put(skb, fraglen - pagedlen);
1730                         skb_set_network_header(skb, exthdrlen);
1731                         data += fragheaderlen;
1732                         skb->transport_header = (skb->network_header +
1733                                                  fragheaderlen);
1734                         if (fraggap) {
1735                                 skb->csum = skb_copy_and_csum_bits(
1736                                         skb_prev, maxfraglen,
1737                                         data + transhdrlen, fraggap);
1738                                 skb_prev->csum = csum_sub(skb_prev->csum,
1739                                                           skb->csum);
1740                                 data += fraggap;
1741                                 pskb_trim_unique(skb_prev, maxfraglen);
1742                         }
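                        /*
                         * Example of the fraggap fix-up above (illustrative
                         * numbers): if the previous skb grew to 1493 bytes
                         * while maxfraglen is 1488, the 5 overhanging bytes
                         * are copied into the head of this fragment, their
                         * checksum is subtracted from skb_prev->csum, and
                         * the previous skb is trimmed back to maxfraglen.
                         */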
1743                         if (copy > 0 &&
1744                             getfrag(from, data + transhdrlen, offset,
1745                                     copy, fraggap, skb) < 0) {
1746                                 err = -EFAULT;
1747                                 kfree_skb(skb);
1748                                 goto error;
1749                         } else if (flags & MSG_SPLICE_PAGES) {
1750                                 copy = 0;
1751                         }
1752
1753                         offset += copy;
1754                         length -= copy + transhdrlen;
1755                         transhdrlen = 0;
1756                         exthdrlen = 0;
1757                         dst_exthdrlen = 0;
1758
1759                         /* Only the initial fragment is time stamped */
1760                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1761                         cork->tx_flags = 0;
1762                         skb_shinfo(skb)->tskey = tskey;
1763                         tskey = 0;
1764                         skb_zcopy_set(skb, uarg, &extra_uref);
1765
1766                         if ((flags & MSG_CONFIRM) && !skb_prev)
1767                                 skb_set_dst_pending_confirm(skb, 1);
1768
1769                         /*
1770                          * Put the packet on the pending queue
1771                          */
1772                         if (!skb->destructor) {
1773                                 skb->destructor = sock_wfree;
1774                                 skb->sk = sk;
1775                                 wmem_alloc_delta += skb->truesize;
1776                         }
1777                         __skb_queue_tail(queue, skb);
1778                         continue;
1779                 }
1780
1781                 if (copy > length)
1782                         copy = length;
1783
1784                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1785                     skb_tailroom(skb) >= copy) {
1786                         unsigned int off;
1787
1788                         off = skb->len;
1789                         if (getfrag(from, skb_put(skb, copy),
1790                                                 offset, copy, off, skb) < 0) {
1791                                 __skb_trim(skb, off);
1792                                 err = -EFAULT;
1793                                 goto error;
1794                         }
1795                 } else if (flags & MSG_SPLICE_PAGES) {
1796                         struct msghdr *msg = from;
1797
1798                         err = -EIO;
1799                         if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1800                                 goto error;
1801
1802                         err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1803                                                    sk->sk_allocation);
1804                         if (err < 0)
1805                                 goto error;
1806                         copy = err;
1807                         wmem_alloc_delta += copy;
1808                 } else if (!zc) {
1809                         int i = skb_shinfo(skb)->nr_frags;
1810
1811                         err = -ENOMEM;
1812                         if (!sk_page_frag_refill(sk, pfrag))
1813                                 goto error;
1814
1815                         skb_zcopy_downgrade_managed(skb);
1816                         if (!skb_can_coalesce(skb, i, pfrag->page,
1817                                               pfrag->offset)) {
1818                                 err = -EMSGSIZE;
1819                                 if (i == MAX_SKB_FRAGS)
1820                                         goto error;
1821
1822                                 __skb_fill_page_desc(skb, i, pfrag->page,
1823                                                      pfrag->offset, 0);
1824                                 skb_shinfo(skb)->nr_frags = ++i;
1825                                 get_page(pfrag->page);
1826                         }
1827                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1828                         if (getfrag(from,
1829                                     page_address(pfrag->page) + pfrag->offset,
1830                                     offset, copy, skb->len, skb) < 0)
1831                                 goto error_efault;
1832
1833                         pfrag->offset += copy;
1834                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1835                         skb->len += copy;
1836                         skb->data_len += copy;
1837                         skb->truesize += copy;
1838                         wmem_alloc_delta += copy;
1839                 } else {
1840                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1841                         if (err < 0)
1842                                 goto error;
1843                 }
1844                 offset += copy;
1845                 length -= copy;
1846         }
1847
1848         if (wmem_alloc_delta)
1849                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1850         return 0;
1851
1852 error_efault:
1853         err = -EFAULT;
1854 error:
1855         net_zcopy_put_abort(uarg, extra_uref);
1856         cork->length -= length;
1857         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1858         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1859         return err;
1860 }
1861
1862 int ip6_append_data(struct sock *sk,
1863                     int getfrag(void *from, char *to, int offset, int len,
1864                                 int odd, struct sk_buff *skb),
1865                     void *from, size_t length, int transhdrlen,
1866                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1867                     struct rt6_info *rt, unsigned int flags)
1868 {
1869         struct inet_sock *inet = inet_sk(sk);
1870         struct ipv6_pinfo *np = inet6_sk(sk);
1871         int exthdrlen;
1872         int err;
1873
1874         if (flags&MSG_PROBE)
1875                 return 0;
1876         if (skb_queue_empty(&sk->sk_write_queue)) {
1877                 /*
1878                  * setup for corking
1879                  */
1880                 dst_hold(&rt->dst);
1881                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1882                                      ipc6, rt);
1883                 if (err)
1884                         return err;
1885
1886                 inet->cork.fl.u.ip6 = *fl6;
1887                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1888                 length += exthdrlen;
1889                 transhdrlen += exthdrlen;
1890         } else {
1891                 transhdrlen = 0;
1892         }
1893
1894         return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1895                                  &np->cork, sk_page_frag(sk), getfrag,
1896                                  from, length, transhdrlen, flags, ipc6);
1897 }
1898 EXPORT_SYMBOL_GPL(ip6_append_data);
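/*
 * Typical call pattern (a sketch of the corked datagram path; the exact
 * callers are assumptions, not part of this file): UDPv6 and raw sockets
 * invoke ip6_append_data() one or more times (e.g. under MSG_MORE) and
 * then either ip6_push_pending_frames() to transmit the queue or
 * ip6_flush_pending_frames() to abort it.
 */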
1899
1900 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1901 {
1902         struct dst_entry *dst = cork->base.dst;
1903
1904         cork->base.dst = NULL;
1905         cork->base.flags &= ~IPCORK_ALLFRAG;
1906         skb_dst_set(skb, dst);
1907 }
1908
1909 static void ip6_cork_release(struct inet_cork_full *cork,
1910                              struct inet6_cork *v6_cork)
1911 {
1912         if (v6_cork->opt) {
1913                 struct ipv6_txoptions *opt = v6_cork->opt;
1914
1915                 kfree(opt->dst0opt);
1916                 kfree(opt->dst1opt);
1917                 kfree(opt->hopopt);
1918                 kfree(opt->srcrt);
1919                 kfree(opt);
1920                 v6_cork->opt = NULL;
1921         }
1922
1923         if (cork->base.dst) {
1924                 dst_release(cork->base.dst);
1925                 cork->base.dst = NULL;
1926                 cork->base.flags &= ~IPCORK_ALLFRAG;
1927         }
1928 }
1929
1930 struct sk_buff *__ip6_make_skb(struct sock *sk,
1931                                struct sk_buff_head *queue,
1932                                struct inet_cork_full *cork,
1933                                struct inet6_cork *v6_cork)
1934 {
1935         struct sk_buff *skb, *tmp_skb;
1936         struct sk_buff **tail_skb;
1937         struct in6_addr *final_dst;
1938         struct ipv6_pinfo *np = inet6_sk(sk);
1939         struct net *net = sock_net(sk);
1940         struct ipv6hdr *hdr;
1941         struct ipv6_txoptions *opt = v6_cork->opt;
1942         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1943         struct flowi6 *fl6 = &cork->fl.u.ip6;
1944         unsigned char proto = fl6->flowi6_proto;
1945
1946         skb = __skb_dequeue(queue);
1947         if (!skb)
1948                 goto out;
1949         tail_skb = &(skb_shinfo(skb)->frag_list);
1950
1951         /* move skb->data from the ext header to the IP header */
1952         if (skb->data < skb_network_header(skb))
1953                 __skb_pull(skb, skb_network_offset(skb));
1954         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1955                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1956                 *tail_skb = tmp_skb;
1957                 tail_skb = &(tmp_skb->next);
1958                 skb->len += tmp_skb->len;
1959                 skb->data_len += tmp_skb->len;
1960                 skb->truesize += tmp_skb->truesize;
1961                 tmp_skb->destructor = NULL;
1962                 tmp_skb->sk = NULL;
1963         }
1964
1965         /* Allow local fragmentation. */
1966         skb->ignore_df = ip6_sk_ignore_df(sk);
1967         __skb_pull(skb, skb_network_header_len(skb));
1968
1969         final_dst = &fl6->daddr;
1970         if (opt && opt->opt_flen)
1971                 ipv6_push_frag_opts(skb, opt, &proto);
1972         if (opt && opt->opt_nflen)
1973                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
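        /*
         * Extension headers are pushed back-to-front: the fragmentable
         * options (opt_flen) first, then the non-fragmentable ones
         * (opt_nflen), which, when a routing header is present, also
         * update final_dst to the first hop so hdr->daddr below gets
         * the right address.
         */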
1974
1975         skb_push(skb, sizeof(struct ipv6hdr));
1976         skb_reset_network_header(skb);
1977         hdr = ipv6_hdr(skb);
1978
1979         ip6_flow_hdr(hdr, v6_cork->tclass,
1980                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1981                                         ip6_autoflowlabel(net, np), fl6));
1982         hdr->hop_limit = v6_cork->hop_limit;
1983         hdr->nexthdr = proto;
1984         hdr->saddr = fl6->saddr;
1985         hdr->daddr = *final_dst;
1986
1987         skb->priority = sk->sk_priority;
1988         skb->mark = cork->base.mark;
1989         skb->tstamp = cork->base.transmit_time;
1990
1991         ip6_cork_steal_dst(skb, cork);
1992         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1993         if (proto == IPPROTO_ICMPV6) {
1994                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1995                 u8 icmp6_type;
1996
1997                 if (sk->sk_socket->type == SOCK_RAW &&
1998                    !inet_test_bit(HDRINCL, sk))
1999                         icmp6_type = fl6->fl6_icmp_type;
2000                 else
2001                         icmp6_type = icmp6_hdr(skb)->icmp6_type;
2002                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
2003                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
2004         }
2005
2006         ip6_cork_release(cork, v6_cork);
2007 out:
2008         return skb;
2009 }
2010
2011 int ip6_send_skb(struct sk_buff *skb)
2012 {
2013         struct net *net = sock_net(skb->sk);
2014         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2015         int err;
2016
2017         err = ip6_local_out(net, skb->sk, skb);
2018         if (err) {
2019                 if (err > 0)
2020                         err = net_xmit_errno(err);
2021                 if (err)
2022                         IP6_INC_STATS(net, rt->rt6i_idev,
2023                                       IPSTATS_MIB_OUTDISCARDS);
2024         }
2025
2026         return err;
2027 }
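/*
 * Note: ip6_local_out() can return positive NET_XMIT codes;
 * net_xmit_errno() maps NET_XMIT_CN to 0 and other failures to -ENOBUFS,
 * so callers of ip6_send_skb() only ever see 0 or a negative errno.
 */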
2028
2029 int ip6_push_pending_frames(struct sock *sk)
2030 {
2031         struct sk_buff *skb;
2032
2033         skb = ip6_finish_skb(sk);
2034         if (!skb)
2035                 return 0;
2036
2037         return ip6_send_skb(skb);
2038 }
2039 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2040
2041 static void __ip6_flush_pending_frames(struct sock *sk,
2042                                        struct sk_buff_head *queue,
2043                                        struct inet_cork_full *cork,
2044                                        struct inet6_cork *v6_cork)
2045 {
2046         struct sk_buff *skb;
2047
2048         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2049                 if (skb_dst(skb))
2050                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2051                                       IPSTATS_MIB_OUTDISCARDS);
2052                 kfree_skb(skb);
2053         }
2054
2055         ip6_cork_release(cork, v6_cork);
2056 }
2057
2058 void ip6_flush_pending_frames(struct sock *sk)
2059 {
2060         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2061                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2062 }
2063 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2064
2065 struct sk_buff *ip6_make_skb(struct sock *sk,
2066                              int getfrag(void *from, char *to, int offset,
2067                                          int len, int odd, struct sk_buff *skb),
2068                              void *from, size_t length, int transhdrlen,
2069                              struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2070                              unsigned int flags, struct inet_cork_full *cork)
2071 {
2072         struct inet6_cork v6_cork;
2073         struct sk_buff_head queue;
2074         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2075         int err;
2076
2077         if (flags & MSG_PROBE) {
2078                 dst_release(&rt->dst);
2079                 return NULL;
2080         }
2081
2082         __skb_queue_head_init(&queue);
2083
2084         cork->base.flags = 0;
2085         cork->base.addr = 0;
2086         cork->base.opt = NULL;
2087         v6_cork.opt = NULL;
2088         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2089         if (err) {
2090                 ip6_cork_release(cork, &v6_cork);
2091                 return ERR_PTR(err);
2092         }
2093         if (ipc6->dontfrag < 0)
2094                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2095
2096         err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2097                                 &current->task_frag, getfrag, from,
2098                                 length + exthdrlen, transhdrlen + exthdrlen,
2099                                 flags, ipc6);
2100         if (err) {
2101                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2102                 return ERR_PTR(err);
2103         }
2104
2105         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2106 }
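/*
 * A hedged sketch of the un-corked fast path (the caller shape is an
 * assumption about typical users such as single-shot UDPv6 sends, not
 * code from this file):
 *
 *     skb = ip6_make_skb(sk, getfrag, msg, len, transhdrlen,
 *                        &ipc6, rt, flags, &cork);
 *     if (!IS_ERR_OR_NULL(skb))
 *             err = ip6_send_skb(skb);
 *
 * which builds and transmits one skb without touching sk_write_queue.
 */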