Merge branch 'x86-apic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[sfrench/cifs-2.6.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64         struct dst_entry *dst = skb_dst(skb);
65         struct net_device *dev = dst->dev;
66         struct neighbour *neigh;
67         struct in6_addr *nexthop;
68         int ret;
69
70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74                     ((mroute6_socket(net, skb) &&
75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77                                          &ipv6_hdr(skb)->saddr))) {
78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80                         /* Do not check for IFF_ALLMULTI; multicast routing
81                            is not supported in any case.
82                          */
83                         if (newskb)
84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85                                         net, sk, newskb, NULL, newskb->dev,
86                                         dev_loopback_xmit);
87
88                         if (ipv6_hdr(skb)->hop_limit == 0) {
89                                 IP6_INC_STATS(net, idev,
90                                               IPSTATS_MIB_OUTDISCARDS);
91                                 kfree_skb(skb);
92                                 return 0;
93                         }
94                 }
95
96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99                     IPV6_ADDR_SCOPE_NODELOCAL &&
100                     !(dev->flags & IFF_LOOPBACK)) {
101                         kfree_skb(skb);
102                         return 0;
103                 }
104         }
105
106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107                 int res = lwtunnel_xmit(skb);
108
109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110                         return res;
111         }
112
113         rcu_read_lock_bh();
114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116         if (unlikely(!neigh))
117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118         if (!IS_ERR(neigh)) {
119                 sock_confirm_neigh(skb, neigh);
120                 ret = neigh_output(neigh, skb);
121                 rcu_read_unlock_bh();
122                 return ret;
123         }
124         rcu_read_unlock_bh();
125
126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127         kfree_skb(skb);
128         return -EINVAL;
129 }
130
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133         int ret;
134
135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136         if (ret) {
137                 kfree_skb(skb);
138                 return ret;
139         }
140
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142         /* Policy lookup after SNAT yielded a new policy */
143         if (skb_dst(skb)->xfrm) {
144                 IPCB(skb)->flags |= IPSKB_REROUTED;
145                 return dst_output(net, sk, skb);
146         }
147 #endif
148
149         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150             dst_allfrag(skb_dst(skb)) ||
151             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
153         else
154                 return ip6_finish_output2(net, sk, skb);
155 }
156
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159         struct net_device *dev = skb_dst(skb)->dev;
160         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161
162         skb->protocol = htons(ETH_P_IPV6);
163         skb->dev = dev;
164
165         if (unlikely(idev->cnf.disable_ipv6)) {
166                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167                 kfree_skb(skb);
168                 return 0;
169         }
170
171         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172                             net, sk, skb, NULL, dev,
173                             ip6_finish_output,
174                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179         if (!np->autoflowlabel_set)
180                 return ip6_default_np_autolabel(net);
181         else
182                 return np->autoflowlabel;
183 }
184
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note : socket lock is not held for SYNACK packets, but might be modified
188  * by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which are using proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192              __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194         struct net *net = sock_net(sk);
195         const struct ipv6_pinfo *np = inet6_sk(sk);
196         struct in6_addr *first_hop = &fl6->daddr;
197         struct dst_entry *dst = skb_dst(skb);
198         struct ipv6hdr *hdr;
199         u8  proto = fl6->flowi6_proto;
200         int seg_len = skb->len;
201         int hlimit = -1;
202         u32 mtu;
203
204         if (opt) {
205                 unsigned int head_room;
206
207                 /* First: exthdrs may take lots of space (~8K for now)
208                    MAX_HEADER is not enough.
209                  */
210                 head_room = opt->opt_nflen + opt->opt_flen;
211                 seg_len += head_room;
212                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213
214                 if (skb_headroom(skb) < head_room) {
215                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216                         if (!skb2) {
217                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218                                               IPSTATS_MIB_OUTDISCARDS);
219                                 kfree_skb(skb);
220                                 return -ENOBUFS;
221                         }
222                         consume_skb(skb);
223                         skb = skb2;
224                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225                          * it is safe to call in our context (socket lock not held)
226                          */
227                         skb_set_owner_w(skb, (struct sock *)sk);
228                 }
229                 if (opt->opt_flen)
230                         ipv6_push_frag_opts(skb, opt, &proto);
231                 if (opt->opt_nflen)
232                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233                                              &fl6->saddr);
234         }
235
236         skb_push(skb, sizeof(struct ipv6hdr));
237         skb_reset_network_header(skb);
238         hdr = ipv6_hdr(skb);
239
240         /*
241          *      Fill in the IPv6 header
242          */
243         if (np)
244                 hlimit = np->hop_limit;
245         if (hlimit < 0)
246                 hlimit = ip6_dst_hoplimit(dst);
247
248         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249                                 ip6_autoflowlabel(net, np), fl6));
250
251         hdr->payload_len = htons(seg_len);
252         hdr->nexthdr = proto;
253         hdr->hop_limit = hlimit;
254
255         hdr->saddr = fl6->saddr;
256         hdr->daddr = *first_hop;
257
258         skb->protocol = htons(ETH_P_IPV6);
259         skb->priority = sk->sk_priority;
260         skb->mark = mark;
261
262         mtu = dst_mtu(dst);
263         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265                               IPSTATS_MIB_OUT, skb->len);
266
267                 /* if egress device is enslaved to an L3 master device pass the
268                  * skb to its handler for processing
269                  */
270                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
271                 if (unlikely(!skb))
272                         return 0;
273
274                 /* hooks should never assume socket lock is held.
275                  * we promote our socket to non const
276                  */
277                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278                                net, (struct sock *)sk, skb, NULL, dst->dev,
279                                dst_output);
280         }
281
282         skb->dev = dst->dev;
283         /* ipv6_local_error() does not require socket lock,
284          * we promote our socket to non const
285          */
286         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287
288         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289         kfree_skb(skb);
290         return -EMSGSIZE;
291 }
292 EXPORT_SYMBOL(ip6_xmit);
293
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296         struct ip6_ra_chain *ra;
297         struct sock *last = NULL;
298
299         read_lock(&ip6_ra_lock);
300         for (ra = ip6_ra_chain; ra; ra = ra->next) {
301                 struct sock *sk = ra->sk;
302                 if (sk && ra->sel == sel &&
303                     (!sk->sk_bound_dev_if ||
304                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
305                         if (last) {
306                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307                                 if (skb2)
308                                         rawv6_rcv(last, skb2);
309                         }
310                         last = sk;
311                 }
312         }
313
314         if (last) {
315                 rawv6_rcv(last, skb);
316                 read_unlock(&ip6_ra_lock);
317                 return 1;
318         }
319         read_unlock(&ip6_ra_lock);
320         return 0;
321 }
322
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325         struct ipv6hdr *hdr = ipv6_hdr(skb);
326         u8 nexthdr = hdr->nexthdr;
327         __be16 frag_off;
328         int offset;
329
330         if (ipv6_ext_hdr(nexthdr)) {
331                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332                 if (offset < 0)
333                         return 0;
334         } else
335                 offset = sizeof(struct ipv6hdr);
336
337         if (nexthdr == IPPROTO_ICMPV6) {
338                 struct icmp6hdr *icmp6;
339
340                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
341                                          offset + 1 - skb->data)))
342                         return 0;
343
344                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345
346                 switch (icmp6->icmp6_type) {
347                 case NDISC_ROUTER_SOLICITATION:
348                 case NDISC_ROUTER_ADVERTISEMENT:
349                 case NDISC_NEIGHBOUR_SOLICITATION:
350                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
351                 case NDISC_REDIRECT:
352                         /* For reaction involving unicast neighbor discovery
353                          * message destined to the proxied address, pass it to
354                          * input function.
355                          */
356                         return 1;
357                 default:
358                         break;
359                 }
360         }
361
362         /*
363          * The proxying router can't forward traffic sent to a link-local
364          * address, so signal the sender and discard the packet. This
365          * behavior is clarified by the MIPv6 specification.
366          */
367         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368                 dst_link_failure(skb);
369                 return -1;
370         }
371
372         return 0;
373 }
374
/*
 * Continuation used by ip6_forward() for the NF_INET_FORWARD hook: hands
 * the packet to dst_output() and returns its result.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	return dst_output(net, sk, skb);
}
380
381 unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
382 {
383         unsigned int mtu;
384         struct inet6_dev *idev;
385
386         if (dst_metric_locked(dst, RTAX_MTU)) {
387                 mtu = dst_metric_raw(dst, RTAX_MTU);
388                 if (mtu)
389                         return mtu;
390         }
391
392         mtu = IPV6_MIN_MTU;
393         rcu_read_lock();
394         idev = __in6_dev_get(dst->dev);
395         if (idev)
396                 mtu = idev->cnf.mtu6;
397         rcu_read_unlock();
398
399         return mtu;
400 }
401 EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
402
403 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
404 {
405         if (skb->len <= mtu)
406                 return false;
407
408         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
409         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
410                 return true;
411
412         if (skb->ignore_df)
413                 return false;
414
415         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
416                 return false;
417
418         return true;
419 }
420
421 int ip6_forward(struct sk_buff *skb)
422 {
423         struct dst_entry *dst = skb_dst(skb);
424         struct ipv6hdr *hdr = ipv6_hdr(skb);
425         struct inet6_skb_parm *opt = IP6CB(skb);
426         struct net *net = dev_net(dst->dev);
427         u32 mtu;
428
429         if (net->ipv6.devconf_all->forwarding == 0)
430                 goto error;
431
432         if (skb->pkt_type != PACKET_HOST)
433                 goto drop;
434
435         if (unlikely(skb->sk))
436                 goto drop;
437
438         if (skb_warn_if_lro(skb))
439                 goto drop;
440
441         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
442                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
443                                 IPSTATS_MIB_INDISCARDS);
444                 goto drop;
445         }
446
447         skb_forward_csum(skb);
448
449         /*
450          *      We DO NOT make any processing on
451          *      RA packets, pushing them to user level AS IS
452          *      without ane WARRANTY that application will be able
453          *      to interpret them. The reason is that we
454          *      cannot make anything clever here.
455          *
456          *      We are not end-node, so that if packet contains
457          *      AH/ESP, we cannot make anything.
458          *      Defragmentation also would be mistake, RA packets
459          *      cannot be fragmented, because there is no warranty
460          *      that different fragments will go along one path. --ANK
461          */
462         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
463                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
464                         return 0;
465         }
466
467         /*
468          *      check and decrement ttl
469          */
470         if (hdr->hop_limit <= 1) {
471                 /* Force OUTPUT device used as source address */
472                 skb->dev = dst->dev;
473                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
474                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
475                                 IPSTATS_MIB_INHDRERRORS);
476
477                 kfree_skb(skb);
478                 return -ETIMEDOUT;
479         }
480
481         /* XXX: idev->cnf.proxy_ndp? */
482         if (net->ipv6.devconf_all->proxy_ndp &&
483             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
484                 int proxied = ip6_forward_proxy_check(skb);
485                 if (proxied > 0)
486                         return ip6_input(skb);
487                 else if (proxied < 0) {
488                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
489                                         IPSTATS_MIB_INDISCARDS);
490                         goto drop;
491                 }
492         }
493
494         if (!xfrm6_route_forward(skb)) {
495                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
496                                 IPSTATS_MIB_INDISCARDS);
497                 goto drop;
498         }
499         dst = skb_dst(skb);
500
501         /* IPv6 specs say nothing about it, but it is clear that we cannot
502            send redirects to source routed frames.
503            We don't send redirects to frames decapsulated from IPsec.
504          */
505         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
506                 struct in6_addr *target = NULL;
507                 struct inet_peer *peer;
508                 struct rt6_info *rt;
509
510                 /*
511                  *      incoming and outgoing devices are the same
512                  *      send a redirect.
513                  */
514
515                 rt = (struct rt6_info *) dst;
516                 if (rt->rt6i_flags & RTF_GATEWAY)
517                         target = &rt->rt6i_gateway;
518                 else
519                         target = &hdr->daddr;
520
521                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
522
523                 /* Limit redirects both by destination (here)
524                    and by source (inside ndisc_send_redirect)
525                  */
526                 if (inet_peer_xrlim_allow(peer, 1*HZ))
527                         ndisc_send_redirect(skb, target);
528                 if (peer)
529                         inet_putpeer(peer);
530         } else {
531                 int addrtype = ipv6_addr_type(&hdr->saddr);
532
533                 /* This check is security critical. */
534                 if (addrtype == IPV6_ADDR_ANY ||
535                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
536                         goto error;
537                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
538                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
539                                     ICMPV6_NOT_NEIGHBOUR, 0);
540                         goto error;
541                 }
542         }
543
544         mtu = ip6_dst_mtu_forward(dst);
545         if (mtu < IPV6_MIN_MTU)
546                 mtu = IPV6_MIN_MTU;
547
548         if (ip6_pkt_too_big(skb, mtu)) {
549                 /* Again, force OUTPUT device used as source address */
550                 skb->dev = dst->dev;
551                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
552                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
553                                 IPSTATS_MIB_INTOOBIGERRORS);
554                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
555                                 IPSTATS_MIB_FRAGFAILS);
556                 kfree_skb(skb);
557                 return -EMSGSIZE;
558         }
559
560         if (skb_cow(skb, dst->dev->hard_header_len)) {
561                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
562                                 IPSTATS_MIB_OUTDISCARDS);
563                 goto drop;
564         }
565
566         hdr = ipv6_hdr(skb);
567
568         /* Mangling hops number delayed to point after skb COW */
569
570         hdr->hop_limit--;
571
572         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
573         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
574         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
575                        net, NULL, skb, skb->dev, dst->dev,
576                        ip6_forward_finish);
577
578 error:
579         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
580 drop:
581         kfree_skb(skb);
582         return -EINVAL;
583 }
584
585 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
586 {
587         to->pkt_type = from->pkt_type;
588         to->priority = from->priority;
589         to->protocol = from->protocol;
590         skb_dst_drop(to);
591         skb_dst_set(to, dst_clone(skb_dst(from)));
592         to->dev = from->dev;
593         to->mark = from->mark;
594
595 #ifdef CONFIG_NET_SCHED
596         to->tc_index = from->tc_index;
597 #endif
598         nf_copy(to, from);
599         skb_copy_secmark(to, from);
600 }
601
602 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
603                  int (*output)(struct net *, struct sock *, struct sk_buff *))
604 {
605         struct sk_buff *frag;
606         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
607         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
608                                 inet6_sk(skb->sk) : NULL;
609         struct ipv6hdr *tmp_hdr;
610         struct frag_hdr *fh;
611         unsigned int mtu, hlen, left, len;
612         int hroom, troom;
613         __be32 frag_id;
614         int ptr, offset = 0, err = 0;
615         u8 *prevhdr, nexthdr = 0;
616
617         err = ip6_find_1stfragopt(skb, &prevhdr);
618         if (err < 0)
619                 goto fail;
620         hlen = err;
621         nexthdr = *prevhdr;
622
623         mtu = ip6_skb_dst_mtu(skb);
624
625         /* We must not fragment if the socket is set to force MTU discovery
626          * or if the skb it not generated by a local socket.
627          */
628         if (unlikely(!skb->ignore_df && skb->len > mtu))
629                 goto fail_toobig;
630
631         if (IP6CB(skb)->frag_max_size) {
632                 if (IP6CB(skb)->frag_max_size > mtu)
633                         goto fail_toobig;
634
635                 /* don't send fragments larger than what we received */
636                 mtu = IP6CB(skb)->frag_max_size;
637                 if (mtu < IPV6_MIN_MTU)
638                         mtu = IPV6_MIN_MTU;
639         }
640
641         if (np && np->frag_size < mtu) {
642                 if (np->frag_size)
643                         mtu = np->frag_size;
644         }
645         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
646                 goto fail_toobig;
647         mtu -= hlen + sizeof(struct frag_hdr);
648
649         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
650                                     &ipv6_hdr(skb)->saddr);
651
652         if (skb->ip_summed == CHECKSUM_PARTIAL &&
653             (err = skb_checksum_help(skb)))
654                 goto fail;
655
656         hroom = LL_RESERVED_SPACE(rt->dst.dev);
657         if (skb_has_frag_list(skb)) {
658                 unsigned int first_len = skb_pagelen(skb);
659                 struct sk_buff *frag2;
660
661                 if (first_len - hlen > mtu ||
662                     ((first_len - hlen) & 7) ||
663                     skb_cloned(skb) ||
664                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
665                         goto slow_path;
666
667                 skb_walk_frags(skb, frag) {
668                         /* Correct geometry. */
669                         if (frag->len > mtu ||
670                             ((frag->len & 7) && frag->next) ||
671                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
672                                 goto slow_path_clean;
673
674                         /* Partially cloned skb? */
675                         if (skb_shared(frag))
676                                 goto slow_path_clean;
677
678                         BUG_ON(frag->sk);
679                         if (skb->sk) {
680                                 frag->sk = skb->sk;
681                                 frag->destructor = sock_wfree;
682                         }
683                         skb->truesize -= frag->truesize;
684                 }
685
686                 err = 0;
687                 offset = 0;
688                 /* BUILD HEADER */
689
690                 *prevhdr = NEXTHDR_FRAGMENT;
691                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
692                 if (!tmp_hdr) {
693                         err = -ENOMEM;
694                         goto fail;
695                 }
696                 frag = skb_shinfo(skb)->frag_list;
697                 skb_frag_list_init(skb);
698
699                 __skb_pull(skb, hlen);
700                 fh = __skb_push(skb, sizeof(struct frag_hdr));
701                 __skb_push(skb, hlen);
702                 skb_reset_network_header(skb);
703                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
704
705                 fh->nexthdr = nexthdr;
706                 fh->reserved = 0;
707                 fh->frag_off = htons(IP6_MF);
708                 fh->identification = frag_id;
709
710                 first_len = skb_pagelen(skb);
711                 skb->data_len = first_len - skb_headlen(skb);
712                 skb->len = first_len;
713                 ipv6_hdr(skb)->payload_len = htons(first_len -
714                                                    sizeof(struct ipv6hdr));
715
716                 for (;;) {
717                         /* Prepare header of the next frame,
718                          * before previous one went down. */
719                         if (frag) {
720                                 frag->ip_summed = CHECKSUM_NONE;
721                                 skb_reset_transport_header(frag);
722                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
723                                 __skb_push(frag, hlen);
724                                 skb_reset_network_header(frag);
725                                 memcpy(skb_network_header(frag), tmp_hdr,
726                                        hlen);
727                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
728                                 fh->nexthdr = nexthdr;
729                                 fh->reserved = 0;
730                                 fh->frag_off = htons(offset);
731                                 if (frag->next)
732                                         fh->frag_off |= htons(IP6_MF);
733                                 fh->identification = frag_id;
734                                 ipv6_hdr(frag)->payload_len =
735                                                 htons(frag->len -
736                                                       sizeof(struct ipv6hdr));
737                                 ip6_copy_metadata(frag, skb);
738                         }
739
740                         err = output(net, sk, skb);
741                         if (!err)
742                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
743                                               IPSTATS_MIB_FRAGCREATES);
744
745                         if (err || !frag)
746                                 break;
747
748                         skb = frag;
749                         frag = skb->next;
750                         skb->next = NULL;
751                 }
752
753                 kfree(tmp_hdr);
754
755                 if (err == 0) {
756                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
757                                       IPSTATS_MIB_FRAGOKS);
758                         return 0;
759                 }
760
761                 kfree_skb_list(frag);
762
763                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
764                               IPSTATS_MIB_FRAGFAILS);
765                 return err;
766
767 slow_path_clean:
768                 skb_walk_frags(skb, frag2) {
769                         if (frag2 == frag)
770                                 break;
771                         frag2->sk = NULL;
772                         frag2->destructor = NULL;
773                         skb->truesize += frag2->truesize;
774                 }
775         }
776
777 slow_path:
778         left = skb->len - hlen;         /* Space per frame */
779         ptr = hlen;                     /* Where to start from */
780
781         /*
782          *      Fragment the datagram.
783          */
784
785         troom = rt->dst.dev->needed_tailroom;
786
787         /*
788          *      Keep copying data until we run out.
789          */
790         while (left > 0)        {
791                 u8 *fragnexthdr_offset;
792
793                 len = left;
794                 /* IF: it doesn't fit, use 'mtu' - the data space left */
795                 if (len > mtu)
796                         len = mtu;
797                 /* IF: we are not sending up to and including the packet end
798                    then align the next start on an eight byte boundary */
799                 if (len < left) {
800                         len &= ~7;
801                 }
802
803                 /* Allocate buffer */
804                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
805                                  hroom + troom, GFP_ATOMIC);
806                 if (!frag) {
807                         err = -ENOMEM;
808                         goto fail;
809                 }
810
811                 /*
812                  *      Set up data on packet
813                  */
814
815                 ip6_copy_metadata(frag, skb);
816                 skb_reserve(frag, hroom);
817                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
818                 skb_reset_network_header(frag);
819                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
820                 frag->transport_header = (frag->network_header + hlen +
821                                           sizeof(struct frag_hdr));
822
823                 /*
824                  *      Charge the memory for the fragment to any owner
825                  *      it might possess
826                  */
827                 if (skb->sk)
828                         skb_set_owner_w(frag, skb->sk);
829
830                 /*
831                  *      Copy the packet header into the new buffer.
832                  */
833                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
834
835                 fragnexthdr_offset = skb_network_header(frag);
836                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
837                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
838
839                 /*
840                  *      Build fragment header.
841                  */
842                 fh->nexthdr = nexthdr;
843                 fh->reserved = 0;
844                 fh->identification = frag_id;
845
846                 /*
847                  *      Copy a block of the IP datagram.
848                  */
849                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
850                                      len));
851                 left -= len;
852
853                 fh->frag_off = htons(offset);
854                 if (left > 0)
855                         fh->frag_off |= htons(IP6_MF);
856                 ipv6_hdr(frag)->payload_len = htons(frag->len -
857                                                     sizeof(struct ipv6hdr));
858
859                 ptr += len;
860                 offset += len;
861
862                 /*
863                  *      Put this fragment into the sending queue.
864                  */
865                 err = output(net, sk, frag);
866                 if (err)
867                         goto fail;
868
869                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
870                               IPSTATS_MIB_FRAGCREATES);
871         }
872         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
873                       IPSTATS_MIB_FRAGOKS);
874         consume_skb(skb);
875         return err;
876
877 fail_toobig:
878         if (skb->sk && dst_allfrag(skb_dst(skb)))
879                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
880
881         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
882         err = -EMSGSIZE;
883
884 fail:
885         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886                       IPSTATS_MIB_FRAGFAILS);
887         kfree_skb(skb);
888         return err;
889 }
890
891 static inline int ip6_rt_check(const struct rt6key *rt_key,
892                                const struct in6_addr *fl_addr,
893                                const struct in6_addr *addr_cache)
894 {
895         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
896                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
897 }
898
899 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
900                                           struct dst_entry *dst,
901                                           const struct flowi6 *fl6)
902 {
903         struct ipv6_pinfo *np = inet6_sk(sk);
904         struct rt6_info *rt;
905
906         if (!dst)
907                 goto out;
908
909         if (dst->ops->family != AF_INET6) {
910                 dst_release(dst);
911                 return NULL;
912         }
913
914         rt = (struct rt6_info *)dst;
915         /* Yes, checking route validity in not connected
916          * case is not very simple. Take into account,
917          * that we do not support routing by source, TOS,
918          * and MSG_DONTROUTE            --ANK (980726)
919          *
920          * 1. ip6_rt_check(): If route was host route,
921          *    check that cached destination is current.
922          *    If it is network route, we still may
923          *    check its validity using saved pointer
924          *    to the last used address: daddr_cache.
925          *    We do not want to save whole address now,
926          *    (because main consumer of this service
927          *    is tcp, which has not this problem),
928          *    so that the last trick works only on connected
929          *    sockets.
930          * 2. oif also should be the same.
931          */
932         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
933 #ifdef CONFIG_IPV6_SUBTREES
934             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
935 #endif
936            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
937               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
938                 dst_release(dst);
939                 dst = NULL;
940         }
941
942 out:
943         return dst;
944 }
945
946 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
947                                struct dst_entry **dst, struct flowi6 *fl6)
948 {
949 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
950         struct neighbour *n;
951         struct rt6_info *rt;
952 #endif
953         int err;
954         int flags = 0;
955
956         /* The correct way to handle this would be to do
957          * ip6_route_get_saddr, and then ip6_route_output; however,
958          * the route-specific preferred source forces the
959          * ip6_route_output call _before_ ip6_route_get_saddr.
960          *
961          * In source specific routing (no src=any default route),
962          * ip6_route_output will fail given src=any saddr, though, so
963          * that's why we try it again later.
964          */
965         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
966                 struct rt6_info *rt;
967                 bool had_dst = *dst != NULL;
968
969                 if (!had_dst)
970                         *dst = ip6_route_output(net, sk, fl6);
971                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
972                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
973                                           sk ? inet6_sk(sk)->srcprefs : 0,
974                                           &fl6->saddr);
975                 if (err)
976                         goto out_err_release;
977
978                 /* If we had an erroneous initial result, pretend it
979                  * never existed and let the SA-enabled version take
980                  * over.
981                  */
982                 if (!had_dst && (*dst)->error) {
983                         dst_release(*dst);
984                         *dst = NULL;
985                 }
986
987                 if (fl6->flowi6_oif)
988                         flags |= RT6_LOOKUP_F_IFACE;
989         }
990
991         if (!*dst)
992                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
993
994         err = (*dst)->error;
995         if (err)
996                 goto out_err_release;
997
998 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
999         /*
1000          * Here if the dst entry we've looked up
1001          * has a neighbour entry that is in the INCOMPLETE
1002          * state and the src address from the flow is
1003          * marked as OPTIMISTIC, we release the found
1004          * dst entry and replace it instead with the
1005          * dst entry of the nexthop router
1006          */
1007         rt = (struct rt6_info *) *dst;
1008         rcu_read_lock_bh();
1009         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1010                                       rt6_nexthop(rt, &fl6->daddr));
1011         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1012         rcu_read_unlock_bh();
1013
1014         if (err) {
1015                 struct inet6_ifaddr *ifp;
1016                 struct flowi6 fl_gw6;
1017                 int redirect;
1018
1019                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1020                                       (*dst)->dev, 1);
1021
1022                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1023                 if (ifp)
1024                         in6_ifa_put(ifp);
1025
1026                 if (redirect) {
1027                         /*
1028                          * We need to get the dst entry for the
1029                          * default router instead
1030                          */
1031                         dst_release(*dst);
1032                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1033                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1034                         *dst = ip6_route_output(net, sk, &fl_gw6);
1035                         err = (*dst)->error;
1036                         if (err)
1037                                 goto out_err_release;
1038                 }
1039         }
1040 #endif
1041         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1042             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1043                 err = -EAFNOSUPPORT;
1044                 goto out_err_release;
1045         }
1046
1047         return 0;
1048
1049 out_err_release:
1050         dst_release(*dst);
1051         *dst = NULL;
1052
1053         if (err == -ENETUNREACH)
1054                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1055         return err;
1056 }
1057
1058 /**
1059  *      ip6_dst_lookup - perform route lookup on flow
1060  *      @sk: socket which provides route info
1061  *      @dst: pointer to dst_entry * for result
1062  *      @fl6: flow to lookup
1063  *
1064  *      This function performs a route lookup on the given flow.
1065  *
1066  *      It returns zero on success, or a standard errno code on error.
1067  */
1068 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1069                    struct flowi6 *fl6)
1070 {
1071         *dst = NULL;
1072         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1073 }
1074 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1075
1076 /**
1077  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1078  *      @sk: socket which provides route info
1079  *      @fl6: flow to lookup
1080  *      @final_dst: final destination address for ipsec lookup
1081  *
1082  *      This function performs a route lookup on the given flow.
1083  *
1084  *      It returns a valid dst pointer on success, or a pointer encoded
1085  *      error code.
1086  */
1087 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1088                                       const struct in6_addr *final_dst)
1089 {
1090         struct dst_entry *dst = NULL;
1091         int err;
1092
1093         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1094         if (err)
1095                 return ERR_PTR(err);
1096         if (final_dst)
1097                 fl6->daddr = *final_dst;
1098
1099         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1100 }
1101 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1102
1103 /**
1104  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1105  *      @sk: socket which provides the dst cache and route info
1106  *      @fl6: flow to lookup
1107  *      @final_dst: final destination address for ipsec lookup
1108  *
1109  *      This function performs a route lookup on the given flow with the
1110  *      possibility of using the cached route in the socket if it is valid.
1111  *      It will take the socket dst lock when operating on the dst cache.
1112  *      As a result, this function can only be used in process context.
1113  *
1114  *      It returns a valid dst pointer on success, or a pointer encoded
1115  *      error code.
1116  */
1117 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1118                                          const struct in6_addr *final_dst)
1119 {
1120         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1121
1122         dst = ip6_sk_dst_check(sk, dst, fl6);
1123         if (!dst)
1124                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1125
1126         return dst;
1127 }
1128 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1129
1130 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1131                                                gfp_t gfp)
1132 {
1133         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1134 }
1135
1136 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1137                                                 gfp_t gfp)
1138 {
1139         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140 }
1141
1142 static void ip6_append_data_mtu(unsigned int *mtu,
1143                                 int *maxfraglen,
1144                                 unsigned int fragheaderlen,
1145                                 struct sk_buff *skb,
1146                                 struct rt6_info *rt,
1147                                 unsigned int orig_mtu)
1148 {
1149         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1150                 if (!skb) {
1151                         /* first fragment, reserve header_len */
1152                         *mtu = orig_mtu - rt->dst.header_len;
1153
1154                 } else {
1155                         /*
1156                          * this fragment is not first, the headers
1157                          * space is regarded as data space.
1158                          */
1159                         *mtu = orig_mtu;
1160                 }
1161                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1162                               + fragheaderlen - sizeof(struct frag_hdr);
1163         }
1164 }
1165
1166 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1167                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1168                           struct rt6_info *rt, struct flowi6 *fl6)
1169 {
1170         struct ipv6_pinfo *np = inet6_sk(sk);
1171         unsigned int mtu;
1172         struct ipv6_txoptions *opt = ipc6->opt;
1173
1174         /*
1175          * setup for corking
1176          */
1177         if (opt) {
1178                 if (WARN_ON(v6_cork->opt))
1179                         return -EINVAL;
1180
1181                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1182                 if (unlikely(!v6_cork->opt))
1183                         return -ENOBUFS;
1184
1185                 v6_cork->opt->tot_len = sizeof(*opt);
1186                 v6_cork->opt->opt_flen = opt->opt_flen;
1187                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1188
1189                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1190                                                     sk->sk_allocation);
1191                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1192                         return -ENOBUFS;
1193
1194                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1195                                                     sk->sk_allocation);
1196                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1197                         return -ENOBUFS;
1198
1199                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1200                                                    sk->sk_allocation);
1201                 if (opt->hopopt && !v6_cork->opt->hopopt)
1202                         return -ENOBUFS;
1203
1204                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1205                                                     sk->sk_allocation);
1206                 if (opt->srcrt && !v6_cork->opt->srcrt)
1207                         return -ENOBUFS;
1208
1209                 /* need source address above miyazawa*/
1210         }
1211         dst_hold(&rt->dst);
1212         cork->base.dst = &rt->dst;
1213         cork->fl.u.ip6 = *fl6;
1214         v6_cork->hop_limit = ipc6->hlimit;
1215         v6_cork->tclass = ipc6->tclass;
1216         if (rt->dst.flags & DST_XFRM_TUNNEL)
1217                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1218                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1219         else
1220                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1221                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1222         if (np->frag_size < mtu) {
1223                 if (np->frag_size)
1224                         mtu = np->frag_size;
1225         }
1226         if (mtu < IPV6_MIN_MTU)
1227                 return -EINVAL;
1228         cork->base.fragsize = mtu;
1229         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1230                 cork->base.flags |= IPCORK_ALLFRAG;
1231         cork->base.length = 0;
1232
1233         return 0;
1234 }
1235
/*
 * __ip6_append_data - append @length bytes to the packets pending on @queue
 *
 * The payload is fetched through @getfrag(@from, ...).  The skb at the tail
 * of @queue is filled first; once it has no room left a new skb is
 * allocated and queued, and the bytes of the previous skb that lie beyond
 * maxfraglen (fraggap) are moved over into the new one.
 *
 * Returns 0 on success.  The size checks at the top fail with -EMSGSIZE
 * after reporting through ipv6_local_error(), leaving cork->length
 * untouched.  Every later failure goes through the error label, which
 * takes the not yet appended @length back out of cork->length, bumps
 * IPSTATS_MIB_OUTDISCARDS and returns a negative errno.
 */
1236 static int __ip6_append_data(struct sock *sk,
1237                              struct flowi6 *fl6,
1238                              struct sk_buff_head *queue,
1239                              struct inet_cork *cork,
1240                              struct inet6_cork *v6_cork,
1241                              struct page_frag *pfrag,
1242                              int getfrag(void *from, char *to, int offset,
1243                                          int len, int odd, struct sk_buff *skb),
1244                              void *from, int length, int transhdrlen,
1245                              unsigned int flags, struct ipcm6_cookie *ipc6,
1246                              const struct sockcm_cookie *sockc)
1247 {
1248         struct sk_buff *skb, *skb_prev = NULL;
1249         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1250         int exthdrlen = 0;
1251         int dst_exthdrlen = 0;
1252         int hh_len;
1253         int copy;
1254         int err;
1255         int offset = 0;
1256         __u8 tx_flags = 0;
1257         u32 tskey = 0;
1258         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1259         struct ipv6_txoptions *opt = v6_cork->opt;
1260         int csummode = CHECKSUM_NONE;
1261         unsigned int maxnonfragsize, headersize;
1262
             /*
              * exthdrlen and dst_exthdrlen stay 0 unless the queue is empty,
              * i.e. unless this call creates the first skb.
              */
1263         skb = skb_peek_tail(queue);
1264         if (!skb) {
1265                 exthdrlen = opt ? opt->opt_flen : 0;
1266                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1267         }
1268
1269         mtu = cork->fragsize;
1270         orig_mtu = mtu;
1271
1272         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1273
             /*
              * fragheaderlen: IPv6 header plus the non-fragmentable headers.
              * maxfraglen: largest skb length that keeps the fragmentable
              * part a multiple of 8 while leaving room for a fragment header.
              */
1274         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1275                         (opt ? opt->opt_nflen : 0);
1276         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1277                      sizeof(struct frag_hdr);
1278
1279         headersize = sizeof(struct ipv6hdr) +
1280                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1281                      (dst_allfrag(&rt->dst) ?
1282                       sizeof(struct frag_hdr) : 0) +
1283                      rt->rt6i_nfheader_len;
1284
1285         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1286          * the first fragment
1287          */
1288         if (headersize + transhdrlen > mtu)
1289                 goto emsgsize;
1290
             /*
              * dontfrag on a UDP or RAW socket: refuse anything that would
              * need fragmenting and report the usable mtu to the socket.
              */
1291         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1292             (sk->sk_protocol == IPPROTO_UDP ||
1293              sk->sk_protocol == IPPROTO_RAW)) {
1294                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1295                                 sizeof(struct ipv6hdr));
1296                 goto emsgsize;
1297         }
1298
1299         if (ip6_sk_ignore_df(sk))
1300                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1301         else
1302                 maxnonfragsize = mtu;
1303
1304         if (cork->length + length > maxnonfragsize - headersize) {
                     /* also the target of the two "goto emsgsize" above */
1305 emsgsize:
1306                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1307                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1308                 return -EMSGSIZE;
1309         }
1310
1311         /* CHECKSUM_PARTIAL only with no extension headers and when
1312          * we are not going to fragment
1313          */
1314         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1315             headersize == sizeof(struct ipv6hdr) &&
1316             length <= mtu - headersize &&
1317             !(flags & MSG_MORE) &&
1318             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1319                 csummode = CHECKSUM_PARTIAL;
1320
1321         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1322                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1323                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1324                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1325                         tskey = sk->sk_tskey++;
1326         }
1327
1328         /*
1329          * Let's try using as much space as possible.
1330          * Use MTU if total length of the message fits into the MTU.
1331          * Otherwise, we need to reserve fragment header and
1332          * fragment alignment (= 8-15 octets, in total).
1333          *
1334          * Note that we may need to "move" the data from the tail of
1335          * the buffer to the new fragment when we split
1336          * the message.
1337          *
1338          * FIXME: It may be fragmented into multiple chunks
1339          *        at once if non-fragmentable extension headers
1340          *        are too large.
1341          * --yoshfuji
1342          */
1343
             /* Accounted up front; the error label takes the remainder back. */
1344         cork->length += length;
1345         if (!skb)
1346                 goto alloc_new_skb;
1347
1348         while (length > 0) {
1349                 /* Check if the remaining data fits into current packet. */
1350                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1351                 if (copy < length)
1352                         copy = maxfraglen - skb->len;
1353
1354                 if (copy <= 0) {
1355                         char *data;
1356                         unsigned int datalen;
1357                         unsigned int fraglen;
1358                         unsigned int fraggap;
1359                         unsigned int alloclen;
                             /*
                              * Also entered by goto, with skb == NULL, when the
                              * queue was empty on entry.
                              */
1360 alloc_new_skb:
1361                         /* There's no room in the current skb */
1362                         if (skb)
1363                                 fraggap = skb->len - maxfraglen;
1364                         else
1365                                 fraggap = 0;
1366                         /* update mtu and maxfraglen if necessary */
1367                         if (!skb || !skb_prev)
1368                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1369                                                     fragheaderlen, skb, rt,
1370                                                     orig_mtu);
1371
1372                         skb_prev = skb;
1373
1374                         /*
1375                          * If remaining data exceeds the mtu,
1376                          * we know we need more fragment(s).
1377                          */
1378                         datalen = length + fraggap;
1379
1380                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1381                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1382                         if ((flags & MSG_MORE) &&
1383                             !(rt->dst.dev->features&NETIF_F_SG))
1384                                 alloclen = mtu;
1385                         else
1386                                 alloclen = datalen + fragheaderlen;
1387
1388                         alloclen += dst_exthdrlen;
1389
1390                         if (datalen != length + fraggap) {
1391                                 /*
1392                                  * this is not the last fragment, the trailer
1393                                  * space is regarded as data space.
1394                                  */
1395                                 datalen += rt->dst.trailer_len;
1396                         }
1397
1398                         alloclen += rt->dst.trailer_len;
1399                         fraglen = datalen + fragheaderlen;
1400
1401                         /*
1402                          * We just reserve space for fragment header.
1403                          * Note: this may be overallocation if the message
1404                          * (without MSG_MORE) fits into the MTU.
1405                          */
1406                         alloclen += sizeof(struct frag_hdr);
1407
                             /* Bytes still to be fetched from the caller for this skb. */
1408                         copy = datalen - transhdrlen - fraggap;
1409                         if (copy < 0) {
1410                                 err = -EINVAL;
1411                                 goto error;
1412                         }
1413                         if (transhdrlen) {
1414                                 skb = sock_alloc_send_skb(sk,
1415                                                 alloclen + hh_len,
1416                                                 (flags & MSG_DONTWAIT), &err);
1417                         } else {
                                     /*
                                      * transhdrlen == 0: use sock_wmalloc(), and only
                                      * while sk_wmem_alloc is at most twice sk_sndbuf;
                                      * otherwise fail with -ENOBUFS.
                                      */
1418                                 skb = NULL;
1419                                 if (refcount_read(&sk->sk_wmem_alloc) <=
1420                                     2 * sk->sk_sndbuf)
1421                                         skb = sock_wmalloc(sk,
1422                                                            alloclen + hh_len, 1,
1423                                                            sk->sk_allocation);
1424                                 if (unlikely(!skb))
1425                                         err = -ENOBUFS;
1426                         }
1427                         if (!skb)
1428                                 goto error;
1429                         /*
1430                          *      Fill in the control structures
1431                          */
1432                         skb->protocol = htons(ETH_P_IPV6);
1433                         skb->ip_summed = csummode;
1434                         skb->csum = 0;
1435                         /* reserve for fragmentation and ipsec header */
1436                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1437                                     dst_exthdrlen);
1438
1439                         /* Only the initial fragment is time stamped */
1440                         skb_shinfo(skb)->tx_flags = tx_flags;
1441                         tx_flags = 0;
1442                         skb_shinfo(skb)->tskey = tskey;
1443                         tskey = 0;
1444
1445                         /*
1446                          *      Find where to start putting bytes
1447                          */
1448                         data = skb_put(skb, fraglen);
1449                         skb_set_network_header(skb, exthdrlen);
1450                         data += fragheaderlen;
1451                         skb->transport_header = (skb->network_header +
1452                                                  fragheaderlen);
                             /*
                              * Move the overhang of the previous skb into this one,
                              * fix up both checksums and trim the previous skb
                              * back to maxfraglen.
                              */
1453                         if (fraggap) {
1454                                 skb->csum = skb_copy_and_csum_bits(
1455                                         skb_prev, maxfraglen,
1456                                         data + transhdrlen, fraggap, 0);
1457                                 skb_prev->csum = csum_sub(skb_prev->csum,
1458                                                           skb->csum);
1459                                 data += fraggap;
1460                                 pskb_trim_unique(skb_prev, maxfraglen);
1461                         }
1462                         if (copy > 0 &&
1463                             getfrag(from, data + transhdrlen, offset,
1464                                     copy, fraggap, skb) < 0) {
1465                                 err = -EFAULT;
1466                                 kfree_skb(skb);
1467                                 goto error;
1468                         }
1469
1470                         offset += copy;
1471                         length -= datalen - fraggap;
                             /* Only the first skb allocated by this call uses these. */
1472                         transhdrlen = 0;
1473                         exthdrlen = 0;
1474                         dst_exthdrlen = 0;
1475
1476                         if ((flags & MSG_CONFIRM) && !skb_prev)
1477                                 skb_set_dst_pending_confirm(skb, 1);
1478
1479                         /*
1480                          * Put the packet on the pending queue
1481                          */
1482                         __skb_queue_tail(queue, skb);
1483                         continue;
1484                 }
1485
1486                 if (copy > length)
1487                         copy = length;
1488
                     /*
                      * Without NETIF_F_SG the data is copied into the skb via
                      * skb_put(); with it, the data goes into the socket page
                      * frag, which is attached to the skb as a fragment.
                      */
1489                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1490                         unsigned int off;
1491
1492                         off = skb->len;
1493                         if (getfrag(from, skb_put(skb, copy),
1494                                                 offset, copy, off, skb) < 0) {
1495                                 __skb_trim(skb, off);
1496                                 err = -EFAULT;
1497                                 goto error;
1498                         }
1499                 } else {
1500                         int i = skb_shinfo(skb)->nr_frags;
1501
1502                         err = -ENOMEM;
1503                         if (!sk_page_frag_refill(sk, pfrag))
1504                                 goto error;
1505
1506                         if (!skb_can_coalesce(skb, i, pfrag->page,
1507                                               pfrag->offset)) {
1508                                 err = -EMSGSIZE;
1509                                 if (i == MAX_SKB_FRAGS)
1510                                         goto error;
1511
1512                                 __skb_fill_page_desc(skb, i, pfrag->page,
1513                                                      pfrag->offset, 0);
1514                                 skb_shinfo(skb)->nr_frags = ++i;
1515                                 get_page(pfrag->page);
1516                         }
1517                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1518                         if (getfrag(from,
1519                                     page_address(pfrag->page) + pfrag->offset,
1520                                     offset, copy, skb->len, skb) < 0)
1521                                 goto error_efault;
1522
1523                         pfrag->offset += copy;
1524                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1525                         skb->len += copy;
1526                         skb->data_len += copy;
1527                         skb->truesize += copy;
1528                         refcount_add(copy, &sk->sk_wmem_alloc);
1529                 }
1530                 offset += copy;
1531                 length -= copy;
1532         }
1533
1534         return 0;
1535
1536 error_efault:
1537         err = -EFAULT;
1538 error:
             /* length is what was not appended; take it back out of the cork. */
1539         cork->length -= length;
1540         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1541         return err;
1542 }
1543
1544 int ip6_append_data(struct sock *sk,
1545                     int getfrag(void *from, char *to, int offset, int len,
1546                                 int odd, struct sk_buff *skb),
1547                     void *from, int length, int transhdrlen,
1548                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1549                     struct rt6_info *rt, unsigned int flags,
1550                     const struct sockcm_cookie *sockc)
1551 {
1552         struct inet_sock *inet = inet_sk(sk);
1553         struct ipv6_pinfo *np = inet6_sk(sk);
1554         int exthdrlen;
1555         int err;
1556
1557         if (flags&MSG_PROBE)
1558                 return 0;
1559         if (skb_queue_empty(&sk->sk_write_queue)) {
1560                 /*
1561                  * setup for corking
1562                  */
1563                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1564                                      ipc6, rt, fl6);
1565                 if (err)
1566                         return err;
1567
1568                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1569                 length += exthdrlen;
1570                 transhdrlen += exthdrlen;
1571         } else {
1572                 fl6 = &inet->cork.fl.u.ip6;
1573                 transhdrlen = 0;
1574         }
1575
1576         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1577                                  &np->cork, sk_page_frag(sk), getfrag,
1578                                  from, length, transhdrlen, flags, ipc6, sockc);
1579 }
1580 EXPORT_SYMBOL_GPL(ip6_append_data);
1581
1582 static void ip6_cork_release(struct inet_cork_full *cork,
1583                              struct inet6_cork *v6_cork)
1584 {
1585         if (v6_cork->opt) {
1586                 kfree(v6_cork->opt->dst0opt);
1587                 kfree(v6_cork->opt->dst1opt);
1588                 kfree(v6_cork->opt->hopopt);
1589                 kfree(v6_cork->opt->srcrt);
1590                 kfree(v6_cork->opt);
1591                 v6_cork->opt = NULL;
1592         }
1593
1594         if (cork->base.dst) {
1595                 dst_release(cork->base.dst);
1596                 cork->base.dst = NULL;
1597                 cork->base.flags &= ~IPCORK_ALLFRAG;
1598         }
1599         memset(&cork->fl, 0, sizeof(cork->fl));
1600 }
1601
1602 struct sk_buff *__ip6_make_skb(struct sock *sk,
1603                                struct sk_buff_head *queue,
1604                                struct inet_cork_full *cork,
1605                                struct inet6_cork *v6_cork)
1606 {
1607         struct sk_buff *skb, *tmp_skb;
1608         struct sk_buff **tail_skb;
1609         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1610         struct ipv6_pinfo *np = inet6_sk(sk);
1611         struct net *net = sock_net(sk);
1612         struct ipv6hdr *hdr;
1613         struct ipv6_txoptions *opt = v6_cork->opt;
1614         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1615         struct flowi6 *fl6 = &cork->fl.u.ip6;
1616         unsigned char proto = fl6->flowi6_proto;
1617
1618         skb = __skb_dequeue(queue);
1619         if (!skb)
1620                 goto out;
1621         tail_skb = &(skb_shinfo(skb)->frag_list);
1622
1623         /* move skb->data to ip header from ext header */
1624         if (skb->data < skb_network_header(skb))
1625                 __skb_pull(skb, skb_network_offset(skb));
1626         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1627                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1628                 *tail_skb = tmp_skb;
1629                 tail_skb = &(tmp_skb->next);
1630                 skb->len += tmp_skb->len;
1631                 skb->data_len += tmp_skb->len;
1632                 skb->truesize += tmp_skb->truesize;
1633                 tmp_skb->destructor = NULL;
1634                 tmp_skb->sk = NULL;
1635         }
1636
1637         /* Allow local fragmentation. */
1638         skb->ignore_df = ip6_sk_ignore_df(sk);
1639
1640         *final_dst = fl6->daddr;
1641         __skb_pull(skb, skb_network_header_len(skb));
1642         if (opt && opt->opt_flen)
1643                 ipv6_push_frag_opts(skb, opt, &proto);
1644         if (opt && opt->opt_nflen)
1645                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1646
1647         skb_push(skb, sizeof(struct ipv6hdr));
1648         skb_reset_network_header(skb);
1649         hdr = ipv6_hdr(skb);
1650
1651         ip6_flow_hdr(hdr, v6_cork->tclass,
1652                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1653                                         ip6_autoflowlabel(net, np), fl6));
1654         hdr->hop_limit = v6_cork->hop_limit;
1655         hdr->nexthdr = proto;
1656         hdr->saddr = fl6->saddr;
1657         hdr->daddr = *final_dst;
1658
1659         skb->priority = sk->sk_priority;
1660         skb->mark = sk->sk_mark;
1661
1662         skb_dst_set(skb, dst_clone(&rt->dst));
1663         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1664         if (proto == IPPROTO_ICMPV6) {
1665                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1666
1667                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1668                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1669         }
1670
1671         ip6_cork_release(cork, v6_cork);
1672 out:
1673         return skb;
1674 }
1675
1676 int ip6_send_skb(struct sk_buff *skb)
1677 {
1678         struct net *net = sock_net(skb->sk);
1679         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1680         int err;
1681
1682         err = ip6_local_out(net, skb->sk, skb);
1683         if (err) {
1684                 if (err > 0)
1685                         err = net_xmit_errno(err);
1686                 if (err)
1687                         IP6_INC_STATS(net, rt->rt6i_idev,
1688                                       IPSTATS_MIB_OUTDISCARDS);
1689         }
1690
1691         return err;
1692 }
1693
/*
 * ip6_push_pending_frames - build and transmit the socket's pending data
 *
 * Returns 0 when ip6_finish_skb() yields no skb, otherwise the result of
 * ip6_send_skb() for the skb it produced.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pending = ip6_finish_skb(sk);

	return pending ? ip6_send_skb(pending) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1705
1706 static void __ip6_flush_pending_frames(struct sock *sk,
1707                                        struct sk_buff_head *queue,
1708                                        struct inet_cork_full *cork,
1709                                        struct inet6_cork *v6_cork)
1710 {
1711         struct sk_buff *skb;
1712
1713         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1714                 if (skb_dst(skb))
1715                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1716                                       IPSTATS_MIB_OUTDISCARDS);
1717                 kfree_skb(skb);
1718         }
1719
1720         ip6_cork_release(cork, v6_cork);
1721 }
1722
1723 void ip6_flush_pending_frames(struct sock *sk)
1724 {
1725         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1726                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1727 }
1728 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1729
1730 struct sk_buff *ip6_make_skb(struct sock *sk,
1731                              int getfrag(void *from, char *to, int offset,
1732                                          int len, int odd, struct sk_buff *skb),
1733                              void *from, int length, int transhdrlen,
1734                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1735                              struct rt6_info *rt, unsigned int flags,
1736                              const struct sockcm_cookie *sockc)
1737 {
1738         struct inet_cork_full cork;
1739         struct inet6_cork v6_cork;
1740         struct sk_buff_head queue;
1741         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1742         int err;
1743
1744         if (flags & MSG_PROBE)
1745                 return NULL;
1746
1747         __skb_queue_head_init(&queue);
1748
1749         cork.base.flags = 0;
1750         cork.base.addr = 0;
1751         cork.base.opt = NULL;
1752         cork.base.dst = NULL;
1753         v6_cork.opt = NULL;
1754         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1755         if (err) {
1756                 ip6_cork_release(&cork, &v6_cork);
1757                 return ERR_PTR(err);
1758         }
1759         if (ipc6->dontfrag < 0)
1760                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1761
1762         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1763                                 &current->task_frag, getfrag, from,
1764                                 length + exthdrlen, transhdrlen + exthdrlen,
1765                                 flags, ipc6, sockc);
1766         if (err) {
1767                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1768                 return ERR_PTR(err);
1769         }
1770
1771         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1772 }