Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[sfrench/cifs-2.6.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64         struct dst_entry *dst = skb_dst(skb);
65         struct net_device *dev = dst->dev;
66         struct neighbour *neigh;
67         struct in6_addr *nexthop;
68         int ret;
69
70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74                     ((mroute6_is_socket(net, skb) &&
75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77                                          &ipv6_hdr(skb)->saddr))) {
78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80                         /* Do not check for IFF_ALLMULTI; multicast routing
81                            is not supported in any case.
82                          */
83                         if (newskb)
84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85                                         net, sk, newskb, NULL, newskb->dev,
86                                         dev_loopback_xmit);
87
88                         if (ipv6_hdr(skb)->hop_limit == 0) {
89                                 IP6_INC_STATS(net, idev,
90                                               IPSTATS_MIB_OUTDISCARDS);
91                                 kfree_skb(skb);
92                                 return 0;
93                         }
94                 }
95
96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99                     IPV6_ADDR_SCOPE_NODELOCAL &&
100                     !(dev->flags & IFF_LOOPBACK)) {
101                         kfree_skb(skb);
102                         return 0;
103                 }
104         }
105
106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107                 int res = lwtunnel_xmit(skb);
108
109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110                         return res;
111         }
112
113         rcu_read_lock_bh();
114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116         if (unlikely(!neigh))
117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118         if (!IS_ERR(neigh)) {
119                 sock_confirm_neigh(skb, neigh);
120                 ret = neigh_output(neigh, skb);
121                 rcu_read_unlock_bh();
122                 return ret;
123         }
124         rcu_read_unlock_bh();
125
126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127         kfree_skb(skb);
128         return -EINVAL;
129 }
130
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133         int ret;
134
135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136         if (ret) {
137                 kfree_skb(skb);
138                 return ret;
139         }
140
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142         /* Policy lookup after SNAT yielded a new policy */
143         if (skb_dst(skb)->xfrm) {
144                 IPCB(skb)->flags |= IPSKB_REROUTED;
145                 return dst_output(net, sk, skb);
146         }
147 #endif
148
149         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150             dst_allfrag(skb_dst(skb)) ||
151             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
153         else
154                 return ip6_finish_output2(net, sk, skb);
155 }
156
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159         struct net_device *dev = skb_dst(skb)->dev;
160         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161
162         skb->protocol = htons(ETH_P_IPV6);
163         skb->dev = dev;
164
165         if (unlikely(idev->cnf.disable_ipv6)) {
166                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167                 kfree_skb(skb);
168                 return 0;
169         }
170
171         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172                             net, sk, skb, NULL, dev,
173                             ip6_finish_output,
174                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179         if (!np->autoflowlabel_set)
180                 return ip6_default_np_autolabel(net);
181         else
182                 return np->autoflowlabel;
183 }
184
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note : socket lock is not held for SYNACK packets, but might be modified
188  * by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which are using proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192              __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194         struct net *net = sock_net(sk);
195         const struct ipv6_pinfo *np = inet6_sk(sk);
196         struct in6_addr *first_hop = &fl6->daddr;
197         struct dst_entry *dst = skb_dst(skb);
198         struct ipv6hdr *hdr;
199         u8  proto = fl6->flowi6_proto;
200         int seg_len = skb->len;
201         int hlimit = -1;
202         u32 mtu;
203
204         if (opt) {
205                 unsigned int head_room;
206
207                 /* First: exthdrs may take lots of space (~8K for now)
208                    MAX_HEADER is not enough.
209                  */
210                 head_room = opt->opt_nflen + opt->opt_flen;
211                 seg_len += head_room;
212                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213
214                 if (skb_headroom(skb) < head_room) {
215                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216                         if (!skb2) {
217                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218                                               IPSTATS_MIB_OUTDISCARDS);
219                                 kfree_skb(skb);
220                                 return -ENOBUFS;
221                         }
222                         consume_skb(skb);
223                         skb = skb2;
224                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225                          * it is safe to call in our context (socket lock not held)
226                          */
227                         skb_set_owner_w(skb, (struct sock *)sk);
228                 }
229                 if (opt->opt_flen)
230                         ipv6_push_frag_opts(skb, opt, &proto);
231                 if (opt->opt_nflen)
232                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233                                              &fl6->saddr);
234         }
235
236         skb_push(skb, sizeof(struct ipv6hdr));
237         skb_reset_network_header(skb);
238         hdr = ipv6_hdr(skb);
239
240         /*
241          *      Fill in the IPv6 header
242          */
243         if (np)
244                 hlimit = np->hop_limit;
245         if (hlimit < 0)
246                 hlimit = ip6_dst_hoplimit(dst);
247
248         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249                                 ip6_autoflowlabel(net, np), fl6));
250
251         hdr->payload_len = htons(seg_len);
252         hdr->nexthdr = proto;
253         hdr->hop_limit = hlimit;
254
255         hdr->saddr = fl6->saddr;
256         hdr->daddr = *first_hop;
257
258         skb->protocol = htons(ETH_P_IPV6);
259         skb->priority = sk->sk_priority;
260         skb->mark = mark;
261
262         mtu = dst_mtu(dst);
263         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265                               IPSTATS_MIB_OUT, skb->len);
266
267                 /* if egress device is enslaved to an L3 master device pass the
268                  * skb to its handler for processing
269                  */
270                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
271                 if (unlikely(!skb))
272                         return 0;
273
274                 /* hooks should never assume socket lock is held.
275                  * we promote our socket to non const
276                  */
277                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278                                net, (struct sock *)sk, skb, NULL, dst->dev,
279                                dst_output);
280         }
281
282         skb->dev = dst->dev;
283         /* ipv6_local_error() does not require socket lock,
284          * we promote our socket to non const
285          */
286         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287
288         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289         kfree_skb(skb);
290         return -EMSGSIZE;
291 }
292 EXPORT_SYMBOL(ip6_xmit);
293
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296         struct ip6_ra_chain *ra;
297         struct sock *last = NULL;
298
299         read_lock(&ip6_ra_lock);
300         for (ra = ip6_ra_chain; ra; ra = ra->next) {
301                 struct sock *sk = ra->sk;
302                 if (sk && ra->sel == sel &&
303                     (!sk->sk_bound_dev_if ||
304                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
305                         if (last) {
306                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307                                 if (skb2)
308                                         rawv6_rcv(last, skb2);
309                         }
310                         last = sk;
311                 }
312         }
313
314         if (last) {
315                 rawv6_rcv(last, skb);
316                 read_unlock(&ip6_ra_lock);
317                 return 1;
318         }
319         read_unlock(&ip6_ra_lock);
320         return 0;
321 }
322
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325         struct ipv6hdr *hdr = ipv6_hdr(skb);
326         u8 nexthdr = hdr->nexthdr;
327         __be16 frag_off;
328         int offset;
329
330         if (ipv6_ext_hdr(nexthdr)) {
331                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332                 if (offset < 0)
333                         return 0;
334         } else
335                 offset = sizeof(struct ipv6hdr);
336
337         if (nexthdr == IPPROTO_ICMPV6) {
338                 struct icmp6hdr *icmp6;
339
340                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
341                                          offset + 1 - skb->data)))
342                         return 0;
343
344                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345
346                 switch (icmp6->icmp6_type) {
347                 case NDISC_ROUTER_SOLICITATION:
348                 case NDISC_ROUTER_ADVERTISEMENT:
349                 case NDISC_NEIGHBOUR_SOLICITATION:
350                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
351                 case NDISC_REDIRECT:
352                         /* For reaction involving unicast neighbor discovery
353                          * message destined to the proxied address, pass it to
354                          * input function.
355                          */
356                         return 1;
357                 default:
358                         break;
359                 }
360         }
361
362         /*
363          * The proxying router can't forward traffic sent to a link-local
364          * address, so signal the sender and discard the packet. This
365          * behavior is clarified by the MIPv6 specification.
366          */
367         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368                 dst_link_failure(skb);
369                 return -1;
370         }
371
372         return 0;
373 }
374
/*
 * Continuation run after the FORWARD netfilter hook: pass the packet on to
 * the output path of its dst and return that result.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int rc;

	rc = dst_output(net, sk, skb);
	return rc;
}
380
381 unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
382 {
383         unsigned int mtu;
384         struct inet6_dev *idev;
385
386         if (dst_metric_locked(dst, RTAX_MTU)) {
387                 mtu = dst_metric_raw(dst, RTAX_MTU);
388                 if (mtu)
389                         return mtu;
390         }
391
392         mtu = IPV6_MIN_MTU;
393         rcu_read_lock();
394         idev = __in6_dev_get(dst->dev);
395         if (idev)
396                 mtu = idev->cnf.mtu6;
397         rcu_read_unlock();
398
399         return mtu;
400 }
401 EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
402
403 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
404 {
405         if (skb->len <= mtu)
406                 return false;
407
408         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
409         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
410                 return true;
411
412         if (skb->ignore_df)
413                 return false;
414
415         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
416                 return false;
417
418         return true;
419 }
420
421 int ip6_forward(struct sk_buff *skb)
422 {
423         struct dst_entry *dst = skb_dst(skb);
424         struct ipv6hdr *hdr = ipv6_hdr(skb);
425         struct inet6_skb_parm *opt = IP6CB(skb);
426         struct net *net = dev_net(dst->dev);
427         u32 mtu;
428
429         if (net->ipv6.devconf_all->forwarding == 0)
430                 goto error;
431
432         if (skb->pkt_type != PACKET_HOST)
433                 goto drop;
434
435         if (unlikely(skb->sk))
436                 goto drop;
437
438         if (skb_warn_if_lro(skb))
439                 goto drop;
440
441         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
442                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
443                                 IPSTATS_MIB_INDISCARDS);
444                 goto drop;
445         }
446
447         skb_forward_csum(skb);
448
449         /*
450          *      We DO NOT make any processing on
451          *      RA packets, pushing them to user level AS IS
452          *      without ane WARRANTY that application will be able
453          *      to interpret them. The reason is that we
454          *      cannot make anything clever here.
455          *
456          *      We are not end-node, so that if packet contains
457          *      AH/ESP, we cannot make anything.
458          *      Defragmentation also would be mistake, RA packets
459          *      cannot be fragmented, because there is no warranty
460          *      that different fragments will go along one path. --ANK
461          */
462         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
463                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
464                         return 0;
465         }
466
467         /*
468          *      check and decrement ttl
469          */
470         if (hdr->hop_limit <= 1) {
471                 /* Force OUTPUT device used as source address */
472                 skb->dev = dst->dev;
473                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
474                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
475                                 IPSTATS_MIB_INHDRERRORS);
476
477                 kfree_skb(skb);
478                 return -ETIMEDOUT;
479         }
480
481         /* XXX: idev->cnf.proxy_ndp? */
482         if (net->ipv6.devconf_all->proxy_ndp &&
483             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
484                 int proxied = ip6_forward_proxy_check(skb);
485                 if (proxied > 0)
486                         return ip6_input(skb);
487                 else if (proxied < 0) {
488                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
489                                         IPSTATS_MIB_INDISCARDS);
490                         goto drop;
491                 }
492         }
493
494         if (!xfrm6_route_forward(skb)) {
495                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
496                                 IPSTATS_MIB_INDISCARDS);
497                 goto drop;
498         }
499         dst = skb_dst(skb);
500
501         /* IPv6 specs say nothing about it, but it is clear that we cannot
502            send redirects to source routed frames.
503            We don't send redirects to frames decapsulated from IPsec.
504          */
505         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
506                 struct in6_addr *target = NULL;
507                 struct inet_peer *peer;
508                 struct rt6_info *rt;
509
510                 /*
511                  *      incoming and outgoing devices are the same
512                  *      send a redirect.
513                  */
514
515                 rt = (struct rt6_info *) dst;
516                 if (rt->rt6i_flags & RTF_GATEWAY)
517                         target = &rt->rt6i_gateway;
518                 else
519                         target = &hdr->daddr;
520
521                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
522
523                 /* Limit redirects both by destination (here)
524                    and by source (inside ndisc_send_redirect)
525                  */
526                 if (inet_peer_xrlim_allow(peer, 1*HZ))
527                         ndisc_send_redirect(skb, target);
528                 if (peer)
529                         inet_putpeer(peer);
530         } else {
531                 int addrtype = ipv6_addr_type(&hdr->saddr);
532
533                 /* This check is security critical. */
534                 if (addrtype == IPV6_ADDR_ANY ||
535                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
536                         goto error;
537                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
538                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
539                                     ICMPV6_NOT_NEIGHBOUR, 0);
540                         goto error;
541                 }
542         }
543
544         mtu = ip6_dst_mtu_forward(dst);
545         if (mtu < IPV6_MIN_MTU)
546                 mtu = IPV6_MIN_MTU;
547
548         if (ip6_pkt_too_big(skb, mtu)) {
549                 /* Again, force OUTPUT device used as source address */
550                 skb->dev = dst->dev;
551                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
552                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
553                                 IPSTATS_MIB_INTOOBIGERRORS);
554                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
555                                 IPSTATS_MIB_FRAGFAILS);
556                 kfree_skb(skb);
557                 return -EMSGSIZE;
558         }
559
560         if (skb_cow(skb, dst->dev->hard_header_len)) {
561                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
562                                 IPSTATS_MIB_OUTDISCARDS);
563                 goto drop;
564         }
565
566         hdr = ipv6_hdr(skb);
567
568         /* Mangling hops number delayed to point after skb COW */
569
570         hdr->hop_limit--;
571
572         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
573         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
574         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
575                        net, NULL, skb, skb->dev, dst->dev,
576                        ip6_forward_finish);
577
578 error:
579         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
580 drop:
581         kfree_skb(skb);
582         return -EINVAL;
583 }
584
585 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
586 {
587         to->pkt_type = from->pkt_type;
588         to->priority = from->priority;
589         to->protocol = from->protocol;
590         skb_dst_drop(to);
591         skb_dst_set(to, dst_clone(skb_dst(from)));
592         to->dev = from->dev;
593         to->mark = from->mark;
594
595 #ifdef CONFIG_NET_SCHED
596         to->tc_index = from->tc_index;
597 #endif
598         nf_copy(to, from);
599         skb_copy_secmark(to, from);
600 }
601
602 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
603                  int (*output)(struct net *, struct sock *, struct sk_buff *))
604 {
605         struct sk_buff *frag;
606         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
607         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
608                                 inet6_sk(skb->sk) : NULL;
609         struct ipv6hdr *tmp_hdr;
610         struct frag_hdr *fh;
611         unsigned int mtu, hlen, left, len;
612         int hroom, troom;
613         __be32 frag_id;
614         int ptr, offset = 0, err = 0;
615         u8 *prevhdr, nexthdr = 0;
616
617         err = ip6_find_1stfragopt(skb, &prevhdr);
618         if (err < 0)
619                 goto fail;
620         hlen = err;
621         nexthdr = *prevhdr;
622
623         mtu = ip6_skb_dst_mtu(skb);
624
625         /* We must not fragment if the socket is set to force MTU discovery
626          * or if the skb it not generated by a local socket.
627          */
628         if (unlikely(!skb->ignore_df && skb->len > mtu))
629                 goto fail_toobig;
630
631         if (IP6CB(skb)->frag_max_size) {
632                 if (IP6CB(skb)->frag_max_size > mtu)
633                         goto fail_toobig;
634
635                 /* don't send fragments larger than what we received */
636                 mtu = IP6CB(skb)->frag_max_size;
637                 if (mtu < IPV6_MIN_MTU)
638                         mtu = IPV6_MIN_MTU;
639         }
640
641         if (np && np->frag_size < mtu) {
642                 if (np->frag_size)
643                         mtu = np->frag_size;
644         }
645         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
646                 goto fail_toobig;
647         mtu -= hlen + sizeof(struct frag_hdr);
648
649         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
650                                     &ipv6_hdr(skb)->saddr);
651
652         if (skb->ip_summed == CHECKSUM_PARTIAL &&
653             (err = skb_checksum_help(skb)))
654                 goto fail;
655
656         hroom = LL_RESERVED_SPACE(rt->dst.dev);
657         if (skb_has_frag_list(skb)) {
658                 unsigned int first_len = skb_pagelen(skb);
659                 struct sk_buff *frag2;
660
661                 if (first_len - hlen > mtu ||
662                     ((first_len - hlen) & 7) ||
663                     skb_cloned(skb) ||
664                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
665                         goto slow_path;
666
667                 skb_walk_frags(skb, frag) {
668                         /* Correct geometry. */
669                         if (frag->len > mtu ||
670                             ((frag->len & 7) && frag->next) ||
671                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
672                                 goto slow_path_clean;
673
674                         /* Partially cloned skb? */
675                         if (skb_shared(frag))
676                                 goto slow_path_clean;
677
678                         BUG_ON(frag->sk);
679                         if (skb->sk) {
680                                 frag->sk = skb->sk;
681                                 frag->destructor = sock_wfree;
682                         }
683                         skb->truesize -= frag->truesize;
684                 }
685
686                 err = 0;
687                 offset = 0;
688                 /* BUILD HEADER */
689
690                 *prevhdr = NEXTHDR_FRAGMENT;
691                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
692                 if (!tmp_hdr) {
693                         err = -ENOMEM;
694                         goto fail;
695                 }
696                 frag = skb_shinfo(skb)->frag_list;
697                 skb_frag_list_init(skb);
698
699                 __skb_pull(skb, hlen);
700                 fh = __skb_push(skb, sizeof(struct frag_hdr));
701                 __skb_push(skb, hlen);
702                 skb_reset_network_header(skb);
703                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
704
705                 fh->nexthdr = nexthdr;
706                 fh->reserved = 0;
707                 fh->frag_off = htons(IP6_MF);
708                 fh->identification = frag_id;
709
710                 first_len = skb_pagelen(skb);
711                 skb->data_len = first_len - skb_headlen(skb);
712                 skb->len = first_len;
713                 ipv6_hdr(skb)->payload_len = htons(first_len -
714                                                    sizeof(struct ipv6hdr));
715
716                 for (;;) {
717                         /* Prepare header of the next frame,
718                          * before previous one went down. */
719                         if (frag) {
720                                 frag->ip_summed = CHECKSUM_NONE;
721                                 skb_reset_transport_header(frag);
722                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
723                                 __skb_push(frag, hlen);
724                                 skb_reset_network_header(frag);
725                                 memcpy(skb_network_header(frag), tmp_hdr,
726                                        hlen);
727                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
728                                 fh->nexthdr = nexthdr;
729                                 fh->reserved = 0;
730                                 fh->frag_off = htons(offset);
731                                 if (frag->next)
732                                         fh->frag_off |= htons(IP6_MF);
733                                 fh->identification = frag_id;
734                                 ipv6_hdr(frag)->payload_len =
735                                                 htons(frag->len -
736                                                       sizeof(struct ipv6hdr));
737                                 ip6_copy_metadata(frag, skb);
738                         }
739
740                         err = output(net, sk, skb);
741                         if (!err)
742                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
743                                               IPSTATS_MIB_FRAGCREATES);
744
745                         if (err || !frag)
746                                 break;
747
748                         skb = frag;
749                         frag = skb->next;
750                         skb->next = NULL;
751                 }
752
753                 kfree(tmp_hdr);
754
755                 if (err == 0) {
756                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
757                                       IPSTATS_MIB_FRAGOKS);
758                         return 0;
759                 }
760
761                 kfree_skb_list(frag);
762
763                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
764                               IPSTATS_MIB_FRAGFAILS);
765                 return err;
766
767 slow_path_clean:
768                 skb_walk_frags(skb, frag2) {
769                         if (frag2 == frag)
770                                 break;
771                         frag2->sk = NULL;
772                         frag2->destructor = NULL;
773                         skb->truesize += frag2->truesize;
774                 }
775         }
776
777 slow_path:
778         left = skb->len - hlen;         /* Space per frame */
779         ptr = hlen;                     /* Where to start from */
780
781         /*
782          *      Fragment the datagram.
783          */
784
785         troom = rt->dst.dev->needed_tailroom;
786
787         /*
788          *      Keep copying data until we run out.
789          */
790         while (left > 0)        {
791                 u8 *fragnexthdr_offset;
792
793                 len = left;
794                 /* IF: it doesn't fit, use 'mtu' - the data space left */
795                 if (len > mtu)
796                         len = mtu;
797                 /* IF: we are not sending up to and including the packet end
798                    then align the next start on an eight byte boundary */
799                 if (len < left) {
800                         len &= ~7;
801                 }
802
803                 /* Allocate buffer */
804                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
805                                  hroom + troom, GFP_ATOMIC);
806                 if (!frag) {
807                         err = -ENOMEM;
808                         goto fail;
809                 }
810
811                 /*
812                  *      Set up data on packet
813                  */
814
815                 ip6_copy_metadata(frag, skb);
816                 skb_reserve(frag, hroom);
817                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
818                 skb_reset_network_header(frag);
819                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
820                 frag->transport_header = (frag->network_header + hlen +
821                                           sizeof(struct frag_hdr));
822
823                 /*
824                  *      Charge the memory for the fragment to any owner
825                  *      it might possess
826                  */
827                 if (skb->sk)
828                         skb_set_owner_w(frag, skb->sk);
829
830                 /*
831                  *      Copy the packet header into the new buffer.
832                  */
833                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
834
835                 fragnexthdr_offset = skb_network_header(frag);
836                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
837                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
838
839                 /*
840                  *      Build fragment header.
841                  */
842                 fh->nexthdr = nexthdr;
843                 fh->reserved = 0;
844                 fh->identification = frag_id;
845
846                 /*
847                  *      Copy a block of the IP datagram.
848                  */
849                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
850                                      len));
851                 left -= len;
852
853                 fh->frag_off = htons(offset);
854                 if (left > 0)
855                         fh->frag_off |= htons(IP6_MF);
856                 ipv6_hdr(frag)->payload_len = htons(frag->len -
857                                                     sizeof(struct ipv6hdr));
858
859                 ptr += len;
860                 offset += len;
861
862                 /*
863                  *      Put this fragment into the sending queue.
864                  */
865                 err = output(net, sk, frag);
866                 if (err)
867                         goto fail;
868
869                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
870                               IPSTATS_MIB_FRAGCREATES);
871         }
872         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
873                       IPSTATS_MIB_FRAGOKS);
874         consume_skb(skb);
875         return err;
876
877 fail_toobig:
878         if (skb->sk && dst_allfrag(skb_dst(skb)))
879                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
880
881         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
882         err = -EMSGSIZE;
883
884 fail:
885         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886                       IPSTATS_MIB_FRAGFAILS);
887         kfree_skb(skb);
888         return err;
889 }
890
891 static inline int ip6_rt_check(const struct rt6key *rt_key,
892                                const struct in6_addr *fl_addr,
893                                const struct in6_addr *addr_cache)
894 {
895         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
896                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
897 }
898
899 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
900                                           struct dst_entry *dst,
901                                           const struct flowi6 *fl6)
902 {
903         struct ipv6_pinfo *np = inet6_sk(sk);
904         struct rt6_info *rt;
905
906         if (!dst)
907                 goto out;
908
909         if (dst->ops->family != AF_INET6) {
910                 dst_release(dst);
911                 return NULL;
912         }
913
914         rt = (struct rt6_info *)dst;
915         /* Yes, checking route validity in not connected
916          * case is not very simple. Take into account,
917          * that we do not support routing by source, TOS,
918          * and MSG_DONTROUTE            --ANK (980726)
919          *
920          * 1. ip6_rt_check(): If route was host route,
921          *    check that cached destination is current.
922          *    If it is network route, we still may
923          *    check its validity using saved pointer
924          *    to the last used address: daddr_cache.
925          *    We do not want to save whole address now,
926          *    (because main consumer of this service
927          *    is tcp, which has not this problem),
928          *    so that the last trick works only on connected
929          *    sockets.
930          * 2. oif also should be the same.
931          */
932         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
933 #ifdef CONFIG_IPV6_SUBTREES
934             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
935 #endif
936            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
937               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
938                 dst_release(dst);
939                 dst = NULL;
940         }
941
942 out:
943         return dst;
944 }
945
946 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
947                                struct dst_entry **dst, struct flowi6 *fl6)
948 {
949 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
950         struct neighbour *n;
951         struct rt6_info *rt;
952 #endif
953         int err;
954         int flags = 0;
955
956         /* The correct way to handle this would be to do
957          * ip6_route_get_saddr, and then ip6_route_output; however,
958          * the route-specific preferred source forces the
959          * ip6_route_output call _before_ ip6_route_get_saddr.
960          *
961          * In source specific routing (no src=any default route),
962          * ip6_route_output will fail given src=any saddr, though, so
963          * that's why we try it again later.
964          */
965         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
966                 struct rt6_info *rt;
967                 bool had_dst = *dst != NULL;
968
969                 if (!had_dst)
970                         *dst = ip6_route_output(net, sk, fl6);
971                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
972                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
973                                           sk ? inet6_sk(sk)->srcprefs : 0,
974                                           &fl6->saddr);
975                 if (err)
976                         goto out_err_release;
977
978                 /* If we had an erroneous initial result, pretend it
979                  * never existed and let the SA-enabled version take
980                  * over.
981                  */
982                 if (!had_dst && (*dst)->error) {
983                         dst_release(*dst);
984                         *dst = NULL;
985                 }
986
987                 if (fl6->flowi6_oif)
988                         flags |= RT6_LOOKUP_F_IFACE;
989         }
990
991         if (!*dst)
992                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
993
994         err = (*dst)->error;
995         if (err)
996                 goto out_err_release;
997
998 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
999         /*
1000          * Here if the dst entry we've looked up
1001          * has a neighbour entry that is in the INCOMPLETE
1002          * state and the src address from the flow is
1003          * marked as OPTIMISTIC, we release the found
1004          * dst entry and replace it instead with the
1005          * dst entry of the nexthop router
1006          */
1007         rt = (struct rt6_info *) *dst;
1008         rcu_read_lock_bh();
1009         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1010                                       rt6_nexthop(rt, &fl6->daddr));
1011         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1012         rcu_read_unlock_bh();
1013
1014         if (err) {
1015                 struct inet6_ifaddr *ifp;
1016                 struct flowi6 fl_gw6;
1017                 int redirect;
1018
1019                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1020                                       (*dst)->dev, 1);
1021
1022                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1023                 if (ifp)
1024                         in6_ifa_put(ifp);
1025
1026                 if (redirect) {
1027                         /*
1028                          * We need to get the dst entry for the
1029                          * default router instead
1030                          */
1031                         dst_release(*dst);
1032                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1033                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1034                         *dst = ip6_route_output(net, sk, &fl_gw6);
1035                         err = (*dst)->error;
1036                         if (err)
1037                                 goto out_err_release;
1038                 }
1039         }
1040 #endif
1041         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1042             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1043                 err = -EAFNOSUPPORT;
1044                 goto out_err_release;
1045         }
1046
1047         return 0;
1048
1049 out_err_release:
1050         dst_release(*dst);
1051         *dst = NULL;
1052
1053         if (err == -ENETUNREACH)
1054                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1055         return err;
1056 }
1057
1058 /**
1059  *      ip6_dst_lookup - perform route lookup on flow
1060  *      @sk: socket which provides route info
1061  *      @dst: pointer to dst_entry * for result
1062  *      @fl6: flow to lookup
1063  *
1064  *      This function performs a route lookup on the given flow.
1065  *
1066  *      It returns zero on success, or a standard errno code on error.
1067  */
1068 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1069                    struct flowi6 *fl6)
1070 {
1071         *dst = NULL;
1072         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1073 }
1074 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1075
1076 /**
1077  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1078  *      @sk: socket which provides route info
1079  *      @fl6: flow to lookup
1080  *      @final_dst: final destination address for ipsec lookup
1081  *
1082  *      This function performs a route lookup on the given flow.
1083  *
1084  *      It returns a valid dst pointer on success, or a pointer encoded
1085  *      error code.
1086  */
1087 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1088                                       const struct in6_addr *final_dst)
1089 {
1090         struct dst_entry *dst = NULL;
1091         int err;
1092
1093         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1094         if (err)
1095                 return ERR_PTR(err);
1096         if (final_dst)
1097                 fl6->daddr = *final_dst;
1098
1099         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1100 }
1101 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1102
1103 /**
1104  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1105  *      @sk: socket which provides the dst cache and route info
1106  *      @fl6: flow to lookup
1107  *      @final_dst: final destination address for ipsec lookup
1108  *
1109  *      This function performs a route lookup on the given flow with the
1110  *      possibility of using the cached route in the socket if it is valid.
1111  *      It will take the socket dst lock when operating on the dst cache.
1112  *      As a result, this function can only be used in process context.
1113  *
1114  *      It returns a valid dst pointer on success, or a pointer encoded
1115  *      error code.
1116  */
1117 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1118                                          const struct in6_addr *final_dst)
1119 {
1120         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1121
1122         dst = ip6_sk_dst_check(sk, dst, fl6);
1123         if (!dst)
1124                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1125
1126         return dst;
1127 }
1128 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1129
1130 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1131                                                gfp_t gfp)
1132 {
1133         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1134 }
1135
1136 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1137                                                 gfp_t gfp)
1138 {
1139         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140 }
1141
1142 static void ip6_append_data_mtu(unsigned int *mtu,
1143                                 int *maxfraglen,
1144                                 unsigned int fragheaderlen,
1145                                 struct sk_buff *skb,
1146                                 struct rt6_info *rt,
1147                                 unsigned int orig_mtu)
1148 {
1149         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1150                 if (!skb) {
1151                         /* first fragment, reserve header_len */
1152                         *mtu = orig_mtu - rt->dst.header_len;
1153
1154                 } else {
1155                         /*
1156                          * this fragment is not first, the headers
1157                          * space is regarded as data space.
1158                          */
1159                         *mtu = orig_mtu;
1160                 }
1161                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1162                               + fragheaderlen - sizeof(struct frag_hdr);
1163         }
1164 }
1165
1166 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1167                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1168                           struct rt6_info *rt, struct flowi6 *fl6)
1169 {
1170         struct ipv6_pinfo *np = inet6_sk(sk);
1171         unsigned int mtu;
1172         struct ipv6_txoptions *opt = ipc6->opt;
1173
1174         /*
1175          * setup for corking
1176          */
1177         if (opt) {
1178                 if (WARN_ON(v6_cork->opt))
1179                         return -EINVAL;
1180
1181                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1182                 if (unlikely(!v6_cork->opt))
1183                         return -ENOBUFS;
1184
1185                 v6_cork->opt->tot_len = sizeof(*opt);
1186                 v6_cork->opt->opt_flen = opt->opt_flen;
1187                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1188
1189                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1190                                                     sk->sk_allocation);
1191                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1192                         return -ENOBUFS;
1193
1194                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1195                                                     sk->sk_allocation);
1196                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1197                         return -ENOBUFS;
1198
1199                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1200                                                    sk->sk_allocation);
1201                 if (opt->hopopt && !v6_cork->opt->hopopt)
1202                         return -ENOBUFS;
1203
1204                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1205                                                     sk->sk_allocation);
1206                 if (opt->srcrt && !v6_cork->opt->srcrt)
1207                         return -ENOBUFS;
1208
1209                 /* need source address above miyazawa*/
1210         }
1211         dst_hold(&rt->dst);
1212         cork->base.dst = &rt->dst;
1213         cork->fl.u.ip6 = *fl6;
1214         v6_cork->hop_limit = ipc6->hlimit;
1215         v6_cork->tclass = ipc6->tclass;
1216         if (rt->dst.flags & DST_XFRM_TUNNEL)
1217                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1218                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1219         else
1220                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1221                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1222         if (np->frag_size < mtu) {
1223                 if (np->frag_size)
1224                         mtu = np->frag_size;
1225         }
1226         if (mtu < IPV6_MIN_MTU)
1227                 return -EINVAL;
1228         cork->base.fragsize = mtu;
1229         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1230                 cork->base.flags |= IPCORK_ALLFRAG;
1231         cork->base.length = 0;
1232
1233         return 0;
1234 }
1235
1236 static int __ip6_append_data(struct sock *sk,
1237                              struct flowi6 *fl6,
1238                              struct sk_buff_head *queue,
1239                              struct inet_cork *cork,
1240                              struct inet6_cork *v6_cork,
1241                              struct page_frag *pfrag,
1242                              int getfrag(void *from, char *to, int offset,
1243                                          int len, int odd, struct sk_buff *skb),
1244                              void *from, int length, int transhdrlen,
1245                              unsigned int flags, struct ipcm6_cookie *ipc6,
1246                              const struct sockcm_cookie *sockc)
1247 {
1248         struct sk_buff *skb, *skb_prev = NULL;
1249         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1250         int exthdrlen = 0;
1251         int dst_exthdrlen = 0;
1252         int hh_len;
1253         int copy;
1254         int err;
1255         int offset = 0;
1256         __u8 tx_flags = 0;
1257         u32 tskey = 0;
1258         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1259         struct ipv6_txoptions *opt = v6_cork->opt;
1260         int csummode = CHECKSUM_NONE;
1261         unsigned int maxnonfragsize, headersize;
1262         unsigned int wmem_alloc_delta = 0;
1263
1264         skb = skb_peek_tail(queue);
1265         if (!skb) {
1266                 exthdrlen = opt ? opt->opt_flen : 0;
1267                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1268         }
1269
1270         mtu = cork->fragsize;
1271         orig_mtu = mtu;
1272
1273         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1274
1275         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1276                         (opt ? opt->opt_nflen : 0);
1277         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1278                      sizeof(struct frag_hdr);
1279
1280         headersize = sizeof(struct ipv6hdr) +
1281                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1282                      (dst_allfrag(&rt->dst) ?
1283                       sizeof(struct frag_hdr) : 0) +
1284                      rt->rt6i_nfheader_len;
1285
1286         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1287          * the first fragment
1288          */
1289         if (headersize + transhdrlen > mtu)
1290                 goto emsgsize;
1291
1292         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1293             (sk->sk_protocol == IPPROTO_UDP ||
1294              sk->sk_protocol == IPPROTO_RAW)) {
1295                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1296                                 sizeof(struct ipv6hdr));
1297                 goto emsgsize;
1298         }
1299
1300         if (ip6_sk_ignore_df(sk))
1301                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1302         else
1303                 maxnonfragsize = mtu;
1304
1305         if (cork->length + length > maxnonfragsize - headersize) {
1306 emsgsize:
1307                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1308                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1309                 return -EMSGSIZE;
1310         }
1311
1312         /* CHECKSUM_PARTIAL only with no extension headers and when
1313          * we are not going to fragment
1314          */
1315         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1316             headersize == sizeof(struct ipv6hdr) &&
1317             length <= mtu - headersize &&
1318             !(flags & MSG_MORE) &&
1319             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1320                 csummode = CHECKSUM_PARTIAL;
1321
1322         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1323                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1324                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1325                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1326                         tskey = sk->sk_tskey++;
1327         }
1328
1329         /*
1330          * Let's try using as much space as possible.
1331          * Use MTU if total length of the message fits into the MTU.
1332          * Otherwise, we need to reserve fragment header and
1333          * fragment alignment (= 8-15 octects, in total).
1334          *
1335          * Note that we may need to "move" the data from the tail of
1336          * of the buffer to the new fragment when we split
1337          * the message.
1338          *
1339          * FIXME: It may be fragmented into multiple chunks
1340          *        at once if non-fragmentable extension headers
1341          *        are too large.
1342          * --yoshfuji
1343          */
1344
1345         cork->length += length;
1346         if (!skb)
1347                 goto alloc_new_skb;
1348
1349         while (length > 0) {
1350                 /* Check if the remaining data fits into current packet. */
1351                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1352                 if (copy < length)
1353                         copy = maxfraglen - skb->len;
1354
1355                 if (copy <= 0) {
1356                         char *data;
1357                         unsigned int datalen;
1358                         unsigned int fraglen;
1359                         unsigned int fraggap;
1360                         unsigned int alloclen;
1361 alloc_new_skb:
1362                         /* There's no room in the current skb */
1363                         if (skb)
1364                                 fraggap = skb->len - maxfraglen;
1365                         else
1366                                 fraggap = 0;
1367                         /* update mtu and maxfraglen if necessary */
1368                         if (!skb || !skb_prev)
1369                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1370                                                     fragheaderlen, skb, rt,
1371                                                     orig_mtu);
1372
1373                         skb_prev = skb;
1374
1375                         /*
1376                          * If remaining data exceeds the mtu,
1377                          * we know we need more fragment(s).
1378                          */
1379                         datalen = length + fraggap;
1380
1381                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1382                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1383                         if ((flags & MSG_MORE) &&
1384                             !(rt->dst.dev->features&NETIF_F_SG))
1385                                 alloclen = mtu;
1386                         else
1387                                 alloclen = datalen + fragheaderlen;
1388
1389                         alloclen += dst_exthdrlen;
1390
1391                         if (datalen != length + fraggap) {
1392                                 /*
1393                                  * this is not the last fragment, the trailer
1394                                  * space is regarded as data space.
1395                                  */
1396                                 datalen += rt->dst.trailer_len;
1397                         }
1398
1399                         alloclen += rt->dst.trailer_len;
1400                         fraglen = datalen + fragheaderlen;
1401
1402                         /*
1403                          * We just reserve space for fragment header.
1404                          * Note: this may be overallocation if the message
1405                          * (without MSG_MORE) fits into the MTU.
1406                          */
1407                         alloclen += sizeof(struct frag_hdr);
1408
1409                         copy = datalen - transhdrlen - fraggap;
1410                         if (copy < 0) {
1411                                 err = -EINVAL;
1412                                 goto error;
1413                         }
1414                         if (transhdrlen) {
1415                                 skb = sock_alloc_send_skb(sk,
1416                                                 alloclen + hh_len,
1417                                                 (flags & MSG_DONTWAIT), &err);
1418                         } else {
1419                                 skb = NULL;
1420                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1421                                     2 * sk->sk_sndbuf)
1422                                         skb = alloc_skb(alloclen + hh_len,
1423                                                         sk->sk_allocation);
1424                                 if (unlikely(!skb))
1425                                         err = -ENOBUFS;
1426                         }
1427                         if (!skb)
1428                                 goto error;
1429                         /*
1430                          *      Fill in the control structures
1431                          */
1432                         skb->protocol = htons(ETH_P_IPV6);
1433                         skb->ip_summed = csummode;
1434                         skb->csum = 0;
1435                         /* reserve for fragmentation and ipsec header */
1436                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1437                                     dst_exthdrlen);
1438
1439                         /* Only the initial fragment is time stamped */
1440                         skb_shinfo(skb)->tx_flags = tx_flags;
1441                         tx_flags = 0;
1442                         skb_shinfo(skb)->tskey = tskey;
1443                         tskey = 0;
1444
1445                         /*
1446                          *      Find where to start putting bytes
1447                          */
1448                         data = skb_put(skb, fraglen);
1449                         skb_set_network_header(skb, exthdrlen);
1450                         data += fragheaderlen;
1451                         skb->transport_header = (skb->network_header +
1452                                                  fragheaderlen);
1453                         if (fraggap) {
1454                                 skb->csum = skb_copy_and_csum_bits(
1455                                         skb_prev, maxfraglen,
1456                                         data + transhdrlen, fraggap, 0);
1457                                 skb_prev->csum = csum_sub(skb_prev->csum,
1458                                                           skb->csum);
1459                                 data += fraggap;
1460                                 pskb_trim_unique(skb_prev, maxfraglen);
1461                         }
1462                         if (copy > 0 &&
1463                             getfrag(from, data + transhdrlen, offset,
1464                                     copy, fraggap, skb) < 0) {
1465                                 err = -EFAULT;
1466                                 kfree_skb(skb);
1467                                 goto error;
1468                         }
1469
1470                         offset += copy;
1471                         length -= datalen - fraggap;
1472                         transhdrlen = 0;
1473                         exthdrlen = 0;
1474                         dst_exthdrlen = 0;
1475
1476                         if ((flags & MSG_CONFIRM) && !skb_prev)
1477                                 skb_set_dst_pending_confirm(skb, 1);
1478
1479                         /*
1480                          * Put the packet on the pending queue
1481                          */
1482                         if (!skb->destructor) {
1483                                 skb->destructor = sock_wfree;
1484                                 skb->sk = sk;
1485                                 wmem_alloc_delta += skb->truesize;
1486                         }
1487                         __skb_queue_tail(queue, skb);
1488                         continue;
1489                 }
1490
1491                 if (copy > length)
1492                         copy = length;
1493
1494                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1495                         unsigned int off;
1496
1497                         off = skb->len;
1498                         if (getfrag(from, skb_put(skb, copy),
1499                                                 offset, copy, off, skb) < 0) {
1500                                 __skb_trim(skb, off);
1501                                 err = -EFAULT;
1502                                 goto error;
1503                         }
1504                 } else {
1505                         int i = skb_shinfo(skb)->nr_frags;
1506
1507                         err = -ENOMEM;
1508                         if (!sk_page_frag_refill(sk, pfrag))
1509                                 goto error;
1510
1511                         if (!skb_can_coalesce(skb, i, pfrag->page,
1512                                               pfrag->offset)) {
1513                                 err = -EMSGSIZE;
1514                                 if (i == MAX_SKB_FRAGS)
1515                                         goto error;
1516
1517                                 __skb_fill_page_desc(skb, i, pfrag->page,
1518                                                      pfrag->offset, 0);
1519                                 skb_shinfo(skb)->nr_frags = ++i;
1520                                 get_page(pfrag->page);
1521                         }
1522                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1523                         if (getfrag(from,
1524                                     page_address(pfrag->page) + pfrag->offset,
1525                                     offset, copy, skb->len, skb) < 0)
1526                                 goto error_efault;
1527
1528                         pfrag->offset += copy;
1529                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1530                         skb->len += copy;
1531                         skb->data_len += copy;
1532                         skb->truesize += copy;
1533                         wmem_alloc_delta += copy;
1534                 }
1535                 offset += copy;
1536                 length -= copy;
1537         }
1538
1539         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1540         return 0;
1541
1542 error_efault:
1543         err = -EFAULT;
1544 error:
1545         cork->length -= length;
1546         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1547         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1548         return err;
1549 }
1550
1551 int ip6_append_data(struct sock *sk,
1552                     int getfrag(void *from, char *to, int offset, int len,
1553                                 int odd, struct sk_buff *skb),
1554                     void *from, int length, int transhdrlen,
1555                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1556                     struct rt6_info *rt, unsigned int flags,
1557                     const struct sockcm_cookie *sockc)
1558 {
1559         struct inet_sock *inet = inet_sk(sk);
1560         struct ipv6_pinfo *np = inet6_sk(sk);
1561         int exthdrlen;
1562         int err;
1563
1564         if (flags&MSG_PROBE)
1565                 return 0;
1566         if (skb_queue_empty(&sk->sk_write_queue)) {
1567                 /*
1568                  * setup for corking
1569                  */
1570                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1571                                      ipc6, rt, fl6);
1572                 if (err)
1573                         return err;
1574
1575                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1576                 length += exthdrlen;
1577                 transhdrlen += exthdrlen;
1578         } else {
1579                 fl6 = &inet->cork.fl.u.ip6;
1580                 transhdrlen = 0;
1581         }
1582
1583         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1584                                  &np->cork, sk_page_frag(sk), getfrag,
1585                                  from, length, transhdrlen, flags, ipc6, sockc);
1586 }
1587 EXPORT_SYMBOL_GPL(ip6_append_data);
1588
1589 static void ip6_cork_release(struct inet_cork_full *cork,
1590                              struct inet6_cork *v6_cork)
1591 {
1592         if (v6_cork->opt) {
1593                 kfree(v6_cork->opt->dst0opt);
1594                 kfree(v6_cork->opt->dst1opt);
1595                 kfree(v6_cork->opt->hopopt);
1596                 kfree(v6_cork->opt->srcrt);
1597                 kfree(v6_cork->opt);
1598                 v6_cork->opt = NULL;
1599         }
1600
1601         if (cork->base.dst) {
1602                 dst_release(cork->base.dst);
1603                 cork->base.dst = NULL;
1604                 cork->base.flags &= ~IPCORK_ALLFRAG;
1605         }
1606         memset(&cork->fl, 0, sizeof(cork->fl));
1607 }
1608
1609 struct sk_buff *__ip6_make_skb(struct sock *sk,
1610                                struct sk_buff_head *queue,
1611                                struct inet_cork_full *cork,
1612                                struct inet6_cork *v6_cork)
1613 {
1614         struct sk_buff *skb, *tmp_skb;
1615         struct sk_buff **tail_skb;
1616         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1617         struct ipv6_pinfo *np = inet6_sk(sk);
1618         struct net *net = sock_net(sk);
1619         struct ipv6hdr *hdr;
1620         struct ipv6_txoptions *opt = v6_cork->opt;
1621         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1622         struct flowi6 *fl6 = &cork->fl.u.ip6;
1623         unsigned char proto = fl6->flowi6_proto;
1624
1625         skb = __skb_dequeue(queue);
1626         if (!skb)
1627                 goto out;
1628         tail_skb = &(skb_shinfo(skb)->frag_list);
1629
1630         /* move skb->data to ip header from ext header */
1631         if (skb->data < skb_network_header(skb))
1632                 __skb_pull(skb, skb_network_offset(skb));
1633         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1634                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1635                 *tail_skb = tmp_skb;
1636                 tail_skb = &(tmp_skb->next);
1637                 skb->len += tmp_skb->len;
1638                 skb->data_len += tmp_skb->len;
1639                 skb->truesize += tmp_skb->truesize;
1640                 tmp_skb->destructor = NULL;
1641                 tmp_skb->sk = NULL;
1642         }
1643
1644         /* Allow local fragmentation. */
1645         skb->ignore_df = ip6_sk_ignore_df(sk);
1646
1647         *final_dst = fl6->daddr;
1648         __skb_pull(skb, skb_network_header_len(skb));
1649         if (opt && opt->opt_flen)
1650                 ipv6_push_frag_opts(skb, opt, &proto);
1651         if (opt && opt->opt_nflen)
1652                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1653
1654         skb_push(skb, sizeof(struct ipv6hdr));
1655         skb_reset_network_header(skb);
1656         hdr = ipv6_hdr(skb);
1657
1658         ip6_flow_hdr(hdr, v6_cork->tclass,
1659                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1660                                         ip6_autoflowlabel(net, np), fl6));
1661         hdr->hop_limit = v6_cork->hop_limit;
1662         hdr->nexthdr = proto;
1663         hdr->saddr = fl6->saddr;
1664         hdr->daddr = *final_dst;
1665
1666         skb->priority = sk->sk_priority;
1667         skb->mark = sk->sk_mark;
1668
1669         skb_dst_set(skb, dst_clone(&rt->dst));
1670         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1671         if (proto == IPPROTO_ICMPV6) {
1672                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1673
1674                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1675                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1676         }
1677
1678         ip6_cork_release(cork, v6_cork);
1679 out:
1680         return skb;
1681 }
1682
1683 int ip6_send_skb(struct sk_buff *skb)
1684 {
1685         struct net *net = sock_net(skb->sk);
1686         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1687         int err;
1688
1689         err = ip6_local_out(net, skb->sk, skb);
1690         if (err) {
1691                 if (err > 0)
1692                         err = net_xmit_errno(err);
1693                 if (err)
1694                         IP6_INC_STATS(net, rt->rt6i_idev,
1695                                       IPSTATS_MIB_OUTDISCARDS);
1696         }
1697
1698         return err;
1699 }
1700
/*
 * Build the packet pending on @sk with ip6_finish_skb() and send it.
 * Returns 0 when there was nothing to send, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1712
1713 static void __ip6_flush_pending_frames(struct sock *sk,
1714                                        struct sk_buff_head *queue,
1715                                        struct inet_cork_full *cork,
1716                                        struct inet6_cork *v6_cork)
1717 {
1718         struct sk_buff *skb;
1719
1720         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1721                 if (skb_dst(skb))
1722                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1723                                       IPSTATS_MIB_OUTDISCARDS);
1724                 kfree_skb(skb);
1725         }
1726
1727         ip6_cork_release(cork, v6_cork);
1728 }
1729
1730 void ip6_flush_pending_frames(struct sock *sk)
1731 {
1732         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1733                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1734 }
1735 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1736
1737 struct sk_buff *ip6_make_skb(struct sock *sk,
1738                              int getfrag(void *from, char *to, int offset,
1739                                          int len, int odd, struct sk_buff *skb),
1740                              void *from, int length, int transhdrlen,
1741                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1742                              struct rt6_info *rt, unsigned int flags,
1743                              const struct sockcm_cookie *sockc)
1744 {
1745         struct inet_cork_full cork;
1746         struct inet6_cork v6_cork;
1747         struct sk_buff_head queue;
1748         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1749         int err;
1750
1751         if (flags & MSG_PROBE)
1752                 return NULL;
1753
1754         __skb_queue_head_init(&queue);
1755
1756         cork.base.flags = 0;
1757         cork.base.addr = 0;
1758         cork.base.opt = NULL;
1759         cork.base.dst = NULL;
1760         v6_cork.opt = NULL;
1761         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1762         if (err) {
1763                 ip6_cork_release(&cork, &v6_cork);
1764                 return ERR_PTR(err);
1765         }
1766         if (ipc6->dontfrag < 0)
1767                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1768
1769         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1770                                 &current->task_frag, getfrag, from,
1771                                 length + exthdrlen, transhdrlen + exthdrlen,
1772                                 flags, ipc6, sockc);
1773         if (err) {
1774                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1775                 return ERR_PTR(err);
1776         }
1777
1778         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1779 }