/* net/ipv6/ip6_output.c */
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

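/* Editor's note: ip6_finish_output2() is the last step on the output path
 * before the neighbour layer. It loops multicast packets back to local
 * listeners when required, honours lightweight-tunnel xmit redirects, and
 * resolves (or creates) the neighbour entry for the next hop before handing
 * the skb to neigh_output().
 */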
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

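/* Editor's note: ip6_finish_output() decides between direct transmission and
 * fragmentation: a packet is fragmented when it exceeds the destination MTU
 * and is not GSO, when the route demands fragmentation of every packet
 * (dst_allfrag()), or when conntrack defragmentation recorded a smaller
 * frag_max_size on input.
 */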
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

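/* Editor's note: ip6_output() is the IPv6 dst_output() handler. It runs the
 * NF_INET_POST_ROUTING netfilter hook (skipped for packets netfilter already
 * rerouted) and then continues in ip6_finish_output().
 */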
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
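/* Editor's note: a minimal usage sketch (hedged; modelled on the way the
 * TCP/IPv6 code calls this function, not a verbatim excerpt):
 *
 *	err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass);
 *
 * The caller is expected to have attached a route with skb_dst_set() and
 * filled in fl6 (saddr, daddr, proto) beforehand.
 */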
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
			 * so it is safe to call in our context (socket lock not held)
			 */
			skb_set_owner_w(skb, (struct sock *)sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

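/* Editor's note: ip6_call_ra_chain() delivers packets carrying an IPv6
 * Router Alert option to every raw socket that registered for that alert
 * value via the IPV6_ROUTER_ALERT socket option. Earlier matches receive
 * clones; the last match consumes the original skb (return value 1).
 */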
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

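/* Editor's note: with proxy NDP enabled, unicast neighbour discovery
 * messages addressed to a proxied address must be handled locally rather
 * than forwarded. ip6_forward_proxy_check() returns 1 for such messages,
 * -1 when the packet must be dropped (link-local destination), and 0 when
 * normal forwarding may proceed.
 */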
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reactions involving unicast neighbor discovery
			 * messages destined to the proxied address, pass them
			 * to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	return dst_output(net, sk, skb);
}

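/* Editor's note: a packet is considered too big to forward when it exceeds
 * the egress MTU, unless conntrack defragmentation vouches for a compatible
 * frag_max_size, ignore_df is set, or GSO will produce segments that fit.
 */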
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

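/* Editor's note: ip6_forward() is the IPv6 forwarding path: policy and
 * sanity checks, Router Alert delivery, hop limit handling (ICMPv6 time
 * exceeded), proxy NDP, redirect generation, MTU enforcement (ICMPv6
 * packet too big) and, after a COW of the header, the hop limit decrement
 * before the NF_INET_FORWARD hook.
 */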
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

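/* Editor's note: ip6_fragment() has two paths. The fast path reuses an
 * existing frag_list whose geometry already matches the MTU, pushing a
 * fragment header onto each piece in place; the slow path allocates a new
 * skb per fragment and copies the payload. Both stamp every fragment with
 * the same identification from ipv6_select_ident().
 */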
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

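/* Editor's note: ip6_rt_check() reports a stale cached route: it returns
 * nonzero when the route is not a host route exactly matching the flow
 * address and the saved last-used address (daddr_cache/saddr_cache) does
 * not match either.
 */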
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

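/* Editor's note: ip6_dst_lookup_tail() resolves the route and, when the
 * flow still has an unspecified source address, selects one via
 * ip6_route_get_saddr(). Under CONFIG_IPV6_OPTIMISTIC_DAD it additionally
 * falls back to the default router's dst when the chosen source address is
 * optimistic and the next hop's neighbour entry is not yet valid.
 */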
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to use for the lookup
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

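/* Editor's note: ip6_append_data_mtu() recomputes the usable MTU and the
 * 8-byte-aligned maximum fragment length while appending data: the first
 * fragment reserves the route's header_len (e.g. tunnel headers), while
 * later fragments may treat that space as data.
 */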
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

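/* Editor's note: ip6_setup_cork() snapshots the state needed to build
 * packets across multiple sendmsg() calls: a private copy of the tx
 * options, a reference to the route, the flow, hop limit, traffic class,
 * and an MTU clamped by np->frag_size and rejected below IPV6_MIN_MTU.
 */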
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = sk->sk_type == SOCK_DGRAM &&
			      sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0;

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

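/* Editor's note: __ip6_append_data() is the workhorse behind
 * ip6_append_data() and ip6_make_skb(). It grows the queue of pending skbs,
 * copying user data via getfrag(), and sizes each skb so the final packets
 * respect the corked MTU and RFC 7112's requirement that the entire header
 * chain fit in the first fragment.
 */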
1234 static int __ip6_append_data(struct sock *sk,
1235                              struct flowi6 *fl6,
1236                              struct sk_buff_head *queue,
1237                              struct inet_cork *cork,
1238                              struct inet6_cork *v6_cork,
1239                              struct page_frag *pfrag,
1240                              int getfrag(void *from, char *to, int offset,
1241                                          int len, int odd, struct sk_buff *skb),
1242                              void *from, int length, int transhdrlen,
1243                              unsigned int flags, struct ipcm6_cookie *ipc6,
1244                              const struct sockcm_cookie *sockc)
1245 {
1246         struct sk_buff *skb, *skb_prev = NULL;
1247         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1248         int exthdrlen = 0;
1249         int dst_exthdrlen = 0;
1250         int hh_len;
1251         int copy;
1252         int err;
1253         int offset = 0;
1254         __u8 tx_flags = 0;
1255         u32 tskey = 0;
1256         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1257         struct ipv6_txoptions *opt = v6_cork->opt;
1258         int csummode = CHECKSUM_NONE;
1259         unsigned int maxnonfragsize, headersize;
1260         unsigned int wmem_alloc_delta = 0;
1261         bool paged;
1262
1263         skb = skb_peek_tail(queue);
1264         if (!skb) {
1265                 exthdrlen = opt ? opt->opt_flen : 0;
1266                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1267         }
1268
1269         paged = !!cork->gso_size;
1270         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1271         orig_mtu = mtu;
1272
1273         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1274
1275         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1276                         (opt ? opt->opt_nflen : 0);
1277         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1278                      sizeof(struct frag_hdr);
1279
1280         headersize = sizeof(struct ipv6hdr) +
1281                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1282                      (dst_allfrag(&rt->dst) ?
1283                       sizeof(struct frag_hdr) : 0) +
1284                      rt->rt6i_nfheader_len;
1285
1286         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1287          * the first fragment
1288          */
1289         if (headersize + transhdrlen > mtu)
1290                 goto emsgsize;
1291
1292         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1293             (sk->sk_protocol == IPPROTO_UDP ||
1294              sk->sk_protocol == IPPROTO_RAW)) {
1295                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1296                                 sizeof(struct ipv6hdr));
1297                 goto emsgsize;
1298         }
1299
1300         if (ip6_sk_ignore_df(sk))
1301                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1302         else
1303                 maxnonfragsize = mtu;
1304
1305         if (cork->length + length > maxnonfragsize - headersize) {
1306 emsgsize:
1307                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1308                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1309                 return -EMSGSIZE;
1310         }
1311
1312         /* CHECKSUM_PARTIAL only with no extension headers and when
1313          * we are not going to fragment
1314          */
1315         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1316             headersize == sizeof(struct ipv6hdr) &&
1317             length <= mtu - headersize &&
1318             (!(flags & MSG_MORE) || cork->gso_size) &&
1319             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1320                 csummode = CHECKSUM_PARTIAL;
1321
1322         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1323                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1324                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1325                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1326                         tskey = sk->sk_tskey++;
1327         }
1328
1329         /*
1330          * Let's try using as much space as possible.
1331          * Use MTU if total length of the message fits into the MTU.
1332          * Otherwise, we need to reserve fragment header and
1333          * fragment alignment (= 8-15 octects, in total).
1334          *
1335          * Note that we may need to "move" the data from the tail of
1336          * of the buffer to the new fragment when we split
1337          * the message.
1338          *
1339          * FIXME: It may be fragmented into multiple chunks
1340          *        at once if non-fragmentable extension headers
1341          *        are too large.
1342          * --yoshfuji
1343          */
1344
1345         cork->length += length;
1346         if (!skb)
1347                 goto alloc_new_skb;
1348
1349         while (length > 0) {
1350                 /* Check if the remaining data fits into current packet. */
1351                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1352                 if (copy < length)
1353                         copy = maxfraglen - skb->len;
1354
1355                 if (copy <= 0) {
1356                         char *data;
1357                         unsigned int datalen;
1358                         unsigned int fraglen;
1359                         unsigned int fraggap;
1360                         unsigned int alloclen;
1361                         unsigned int pagedlen = 0;
1362 alloc_new_skb:
1363                         /* There's no room in the current skb */
1364                         if (skb)
1365                                 fraggap = skb->len - maxfraglen;
1366                         else
1367                                 fraggap = 0;
1368                         /* update mtu and maxfraglen if necessary */
1369                         if (!skb || !skb_prev)
1370                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1371                                                     fragheaderlen, skb, rt,
1372                                                     orig_mtu);
1373
1374                         skb_prev = skb;
1375
1376                         /*
1377                          * If remaining data exceeds the mtu,
1378                          * we know we need more fragment(s).
1379                          */
1380                         datalen = length + fraggap;
1381
1382                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1383                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1384                         fraglen = datalen + fragheaderlen;
1385
1386                         if ((flags & MSG_MORE) &&
1387                             !(rt->dst.dev->features&NETIF_F_SG))
1388                                 alloclen = mtu;
1389                         else if (!paged)
1390                                 alloclen = fraglen;
1391                         else {
1392                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1393                                 pagedlen = fraglen - alloclen;
1394                         }
1395
1396                         alloclen += dst_exthdrlen;
1397
1398                         if (datalen != length + fraggap) {
1399                                 /*
1400                                  * this is not the last fragment, the trailer
1401                                  * space is regarded as data space.
1402                                  */
1403                                 datalen += rt->dst.trailer_len;
1404                         }
1405
1406                         alloclen += rt->dst.trailer_len;
1407                         fraglen = datalen + fragheaderlen;
1408
1409                         /*
1410                          * We just reserve space for the fragment header.
1411                          * Note: this may be an overallocation if the message
1412                          * (without MSG_MORE) fits into the MTU.
1413                          */
1414                         alloclen += sizeof(struct frag_hdr);
1415
1416                         copy = datalen - transhdrlen - fraggap - pagedlen;
1417                         if (copy < 0) {
1418                                 err = -EINVAL;
1419                                 goto error;
1420                         }
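                        /*
                         * The first skb (transhdrlen != 0) is charged to the
                         * socket by sock_alloc_send_skb(); later skbs are
                         * allocated directly, bounded by twice sk_sndbuf.
                         */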
1421                         if (transhdrlen) {
1422                                 skb = sock_alloc_send_skb(sk,
1423                                                 alloclen + hh_len,
1424                                                 (flags & MSG_DONTWAIT), &err);
1425                         } else {
1426                                 skb = NULL;
1427                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1428                                     2 * sk->sk_sndbuf)
1429                                         skb = alloc_skb(alloclen + hh_len,
1430                                                         sk->sk_allocation);
1431                                 if (unlikely(!skb))
1432                                         err = -ENOBUFS;
1433                         }
1434                         if (!skb)
1435                                 goto error;
1436                         /*
1437                          *      Fill in the control structures
1438                          */
1439                         skb->protocol = htons(ETH_P_IPV6);
1440                         skb->ip_summed = csummode;
1441                         skb->csum = 0;
1442                         /* Reserve room for the fragment header and any IPsec headers. */
1443                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1444                                     dst_exthdrlen);
1445
1446                         /* Only the initial fragment is timestamped. */
1447                         skb_shinfo(skb)->tx_flags = tx_flags;
1448                         tx_flags = 0;
1449                         skb_shinfo(skb)->tskey = tskey;
1450                         tskey = 0;
1451
1452                         /*
1453                          *      Find where to start putting bytes
1454                          */
1455                         data = skb_put(skb, fraglen - pagedlen);
1456                         skb_set_network_header(skb, exthdrlen);
1457                         data += fragheaderlen;
1458                         skb->transport_header = (skb->network_header +
1459                                                  fragheaderlen);
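                        /*
                         * If the previous skb ran past maxfraglen, pull the
                         * overshoot (fraggap) into this skb, trim the old
                         * one back to the fragment boundary, and fix up
                         * both checksums.
                         */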
1460                         if (fraggap) {
1461                                 skb->csum = skb_copy_and_csum_bits(
1462                                         skb_prev, maxfraglen,
1463                                         data + transhdrlen, fraggap, 0);
1464                                 skb_prev->csum = csum_sub(skb_prev->csum,
1465                                                           skb->csum);
1466                                 data += fraggap;
1467                                 pskb_trim_unique(skb_prev, maxfraglen);
1468                         }
1469                         if (copy > 0 &&
1470                             getfrag(from, data + transhdrlen, offset,
1471                                     copy, fraggap, skb) < 0) {
1472                                 err = -EFAULT;
1473                                 kfree_skb(skb);
1474                                 goto error;
1475                         }
1476
1477                         offset += copy;
1478                         length -= copy + transhdrlen;
1479                         transhdrlen = 0;
1480                         exthdrlen = 0;
1481                         dst_exthdrlen = 0;
1482
1483                         if ((flags & MSG_CONFIRM) && !skb_prev)
1484                                 skb_set_dst_pending_confirm(skb, 1);
1485
1486                         /*
1487                          * Put the packet on the pending queue
1488                          */
1489                         if (!skb->destructor) {
1490                                 skb->destructor = sock_wfree;
1491                                 skb->sk = sk;
1492                                 wmem_alloc_delta += skb->truesize;
1493                         }
1494                         __skb_queue_tail(queue, skb);
1495                         continue;
1496                 }
1497
1498                 if (copy > length)
1499                         copy = length;
1500
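                /*
                 * Copy into the existing skb: into its linear tailroom when
                 * the device cannot handle scatter-gather, otherwise into
                 * (possibly coalesced) page fragments.
                 */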
1501                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1502                     skb_tailroom(skb) >= copy) {
1503                         unsigned int off;
1504
1505                         off = skb->len;
1506                         if (getfrag(from, skb_put(skb, copy),
1507                                                 offset, copy, off, skb) < 0) {
1508                                 __skb_trim(skb, off);
1509                                 err = -EFAULT;
1510                                 goto error;
1511                         }
1512                 } else {
1513                         int i = skb_shinfo(skb)->nr_frags;
1514
1515                         err = -ENOMEM;
1516                         if (!sk_page_frag_refill(sk, pfrag))
1517                                 goto error;
1518
1519                         if (!skb_can_coalesce(skb, i, pfrag->page,
1520                                               pfrag->offset)) {
1521                                 err = -EMSGSIZE;
1522                                 if (i == MAX_SKB_FRAGS)
1523                                         goto error;
1524
1525                                 __skb_fill_page_desc(skb, i, pfrag->page,
1526                                                      pfrag->offset, 0);
1527                                 skb_shinfo(skb)->nr_frags = ++i;
1528                                 get_page(pfrag->page);
1529                         }
1530                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1531                         if (getfrag(from,
1532                                     page_address(pfrag->page) + pfrag->offset,
1533                                     offset, copy, skb->len, skb) < 0)
1534                                 goto error_efault;
1535
1536                         pfrag->offset += copy;
1537                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1538                         skb->len += copy;
1539                         skb->data_len += copy;
1540                         skb->truesize += copy;
1541                         wmem_alloc_delta += copy;
1542                 }
1543                 offset += copy;
1544                 length -= copy;
1545         }
1546
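        /* Commit the batched truesize charge to the socket in one update. */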
1547         if (wmem_alloc_delta)
1548                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1549         return 0;
1550
1551 error_efault:
1552         err = -EFAULT;
1553 error:
1554         cork->length -= length;
1555         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1556         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1557         return err;
1558 }
1559
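/*
 *      ip6_append_data() queues user data on the socket's write queue as
 *      fragment-sized skbs; ip6_push_pending_frames() later turns the
 *      queue into a packet and sends it.  The first call on an empty
 *      write queue sets up the cork from ipc6, rt and fl6.
 */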
1560 int ip6_append_data(struct sock *sk,
1561                     int getfrag(void *from, char *to, int offset, int len,
1562                                 int odd, struct sk_buff *skb),
1563                     void *from, int length, int transhdrlen,
1564                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1565                     struct rt6_info *rt, unsigned int flags,
1566                     const struct sockcm_cookie *sockc)
1567 {
1568         struct inet_sock *inet = inet_sk(sk);
1569         struct ipv6_pinfo *np = inet6_sk(sk);
1570         int exthdrlen;
1571         int err;
1572
1573         if (flags&MSG_PROBE)
1574                 return 0;
1575         if (skb_queue_empty(&sk->sk_write_queue)) {
1576                 /*
1577                  * setup for corking
1578                  */
1579                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1580                                      ipc6, rt, fl6);
1581                 if (err)
1582                         return err;
1583
1584                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1585                 length += exthdrlen;
1586                 transhdrlen += exthdrlen;
1587         } else {
1588                 fl6 = &inet->cork.fl.u.ip6;
1589                 transhdrlen = 0;
1590         }
1591
1592         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1593                                  &np->cork, sk_page_frag(sk), getfrag,
1594                                  from, length, transhdrlen, flags, ipc6, sockc);
1595 }
1596 EXPORT_SYMBOL_GPL(ip6_append_data);
1597
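/* Free the corked option copies and drop the cached route and flow. */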
1598 static void ip6_cork_release(struct inet_cork_full *cork,
1599                              struct inet6_cork *v6_cork)
1600 {
1601         if (v6_cork->opt) {
1602                 kfree(v6_cork->opt->dst0opt);
1603                 kfree(v6_cork->opt->dst1opt);
1604                 kfree(v6_cork->opt->hopopt);
1605                 kfree(v6_cork->opt->srcrt);
1606                 kfree(v6_cork->opt);
1607                 v6_cork->opt = NULL;
1608         }
1609
1610         if (cork->base.dst) {
1611                 dst_release(cork->base.dst);
1612                 cork->base.dst = NULL;
1613                 cork->base.flags &= ~IPCORK_ALLFRAG;
1614         }
1615         memset(&cork->fl, 0, sizeof(cork->fl));
1616 }
1617
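/*
 * Collapse the queued skbs into a single packet: chain the followers on
 * the head skb's frag_list, push any extension headers and the IPv6
 * header, then release the cork and return the finished skb.
 */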
1618 struct sk_buff *__ip6_make_skb(struct sock *sk,
1619                                struct sk_buff_head *queue,
1620                                struct inet_cork_full *cork,
1621                                struct inet6_cork *v6_cork)
1622 {
1623         struct sk_buff *skb, *tmp_skb;
1624         struct sk_buff **tail_skb;
1625         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1626         struct ipv6_pinfo *np = inet6_sk(sk);
1627         struct net *net = sock_net(sk);
1628         struct ipv6hdr *hdr;
1629         struct ipv6_txoptions *opt = v6_cork->opt;
1630         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1631         struct flowi6 *fl6 = &cork->fl.u.ip6;
1632         unsigned char proto = fl6->flowi6_proto;
1633
1634         skb = __skb_dequeue(queue);
1635         if (!skb)
1636                 goto out;
1637         tail_skb = &(skb_shinfo(skb)->frag_list);
1638
1639         /* Move skb->data from the extension header to the IP header. */
1640         if (skb->data < skb_network_header(skb))
1641                 __skb_pull(skb, skb_network_offset(skb));
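        /* Chain the remaining queued skbs onto the head skb's frag_list. */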
1642         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1643                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1644                 *tail_skb = tmp_skb;
1645                 tail_skb = &(tmp_skb->next);
1646                 skb->len += tmp_skb->len;
1647                 skb->data_len += tmp_skb->len;
1648                 skb->truesize += tmp_skb->truesize;
1649                 tmp_skb->destructor = NULL;
1650                 tmp_skb->sk = NULL;
1651         }
1652
1653         /* Allow local fragmentation. */
1654         skb->ignore_df = ip6_sk_ignore_df(sk);
1655
1656         *final_dst = fl6->daddr;
1657         __skb_pull(skb, skb_network_header_len(skb));
1658         if (opt && opt->opt_flen)
1659                 ipv6_push_frag_opts(skb, opt, &proto);
1660         if (opt && opt->opt_nflen)
1661                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1662
1663         skb_push(skb, sizeof(struct ipv6hdr));
1664         skb_reset_network_header(skb);
1665         hdr = ipv6_hdr(skb);
1666
1667         ip6_flow_hdr(hdr, v6_cork->tclass,
1668                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1669                                         ip6_autoflowlabel(net, np), fl6));
1670         hdr->hop_limit = v6_cork->hop_limit;
1671         hdr->nexthdr = proto;
1672         hdr->saddr = fl6->saddr;
1673         hdr->daddr = *final_dst;
1674
1675         skb->priority = sk->sk_priority;
1676         skb->mark = sk->sk_mark;
1677
1678         skb_dst_set(skb, dst_clone(&rt->dst));
1679         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1680         if (proto == IPPROTO_ICMPV6) {
1681                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1682
1683                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1684                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1685         }
1686
1687         ip6_cork_release(cork, v6_cork);
1688 out:
1689         return skb;
1690 }
1691
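/*
 * Hand a packet built by __ip6_make_skb() to the output path, mapping
 * positive qdisc return codes to errnos and counting failures as
 * output discards.
 */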
1692 int ip6_send_skb(struct sk_buff *skb)
1693 {
1694         struct net *net = sock_net(skb->sk);
1695         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1696         int err;
1697
1698         err = ip6_local_out(net, skb->sk, skb);
1699         if (err) {
1700                 if (err > 0)
1701                         err = net_xmit_errno(err);
1702                 if (err)
1703                         IP6_INC_STATS(net, rt->rt6i_idev,
1704                                       IPSTATS_MIB_OUTDISCARDS);
1705         }
1706
1707         return err;
1708 }
1709
1710 int ip6_push_pending_frames(struct sock *sk)
1711 {
1712         struct sk_buff *skb;
1713
1714         skb = ip6_finish_skb(sk);
1715         if (!skb)
1716                 return 0;
1717
1718         return ip6_send_skb(skb);
1719 }
1720 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1721
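/*
 * Drop everything still queued on the cork; skbs that already carry a
 * dst are counted as output discards.
 */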
1722 static void __ip6_flush_pending_frames(struct sock *sk,
1723                                        struct sk_buff_head *queue,
1724                                        struct inet_cork_full *cork,
1725                                        struct inet6_cork *v6_cork)
1726 {
1727         struct sk_buff *skb;
1728
1729         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1730                 if (skb_dst(skb))
1731                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1732                                       IPSTATS_MIB_OUTDISCARDS);
1733                 kfree_skb(skb);
1734         }
1735
1736         ip6_cork_release(cork, v6_cork);
1737 }
1738
1739 void ip6_flush_pending_frames(struct sock *sk)
1740 {
1741         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1742                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1743 }
1744 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1745
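/*
 * ip6_make_skb() is the uncorked one-shot path: the whole message is
 * appended to a private queue and turned into a packet in one call,
 * leaving the socket's write queue and persistent cork state untouched.
 */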
1746 struct sk_buff *ip6_make_skb(struct sock *sk,
1747                              int getfrag(void *from, char *to, int offset,
1748                                          int len, int odd, struct sk_buff *skb),
1749                              void *from, int length, int transhdrlen,
1750                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1751                              struct rt6_info *rt, unsigned int flags,
1752                              struct inet_cork_full *cork,
1753                              const struct sockcm_cookie *sockc)
1754 {
1755         struct inet6_cork v6_cork;
1756         struct sk_buff_head queue;
1757         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1758         int err;
1759
1760         if (flags & MSG_PROBE)
1761                 return NULL;
1762
1763         __skb_queue_head_init(&queue);
1764
1765         cork->base.flags = 0;
1766         cork->base.addr = 0;
1767         cork->base.opt = NULL;
1768         cork->base.dst = NULL;
1769         v6_cork.opt = NULL;
1770         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1771         if (err) {
1772                 ip6_cork_release(cork, &v6_cork);
1773                 return ERR_PTR(err);
1774         }
1775         if (ipc6->dontfrag < 0)
1776                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1777
1778         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1779                                 &current->task_frag, getfrag, from,
1780                                 length + exthdrlen, transhdrlen + exthdrlen,
1781                                 flags, ipc6, sockc);
1782         if (err) {
1783                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1784                 return ERR_PTR(err);
1785         }
1786
1787         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1788 }