Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
author David S. Miller <davem@davemloft.net>
Tue, 22 Aug 2017 00:06:42 +0000 (17:06 -0700)
committer David S. Miller <davem@davemloft.net>
Tue, 22 Aug 2017 00:06:42 +0000 (17:06 -0700)
20 files changed:
MAINTAINERS
drivers/net/ethernet/mellanox/mlx4/main.c
drivers/net/ethernet/netronome/nfp/nfp_net_common.c
drivers/net/tun.c
include/net/ip.h
include/net/sch_generic.h
include/net/sock.h
kernel/events/core.c
net/core/datagram.c
net/ipv4/igmp.c
net/ipv4/route.c
net/ipv4/tcp_input.c
net/ipv4/udp.c
net/ipv6/ip6_fib.c
net/ipv6/udp.c
net/openvswitch/datapath.c
net/sched/cls_api.c
net/sctp/ipv6.c
net/unix/af_unix.c
tools/lib/bpf/libbpf.c

diff --combined MAINTAINERS
index 0e967b3ca1c612ad4b3e82d29a6a2cb26102df63,1c3feffb1c1cfd2b46685907300a9f026fb67e6a..11e1bcec9cbbe8c5840fead4a0e6e93e84a33538
@@@ -2477,7 -2477,7 +2477,7 @@@ Q:      https://patchwork.open-mesh.org/proj
  S:    Maintained
  F:    Documentation/ABI/testing/sysfs-class-net-batman-adv
  F:    Documentation/ABI/testing/sysfs-class-net-mesh
 -F:    Documentation/networking/batman-adv.txt
 +F:    Documentation/networking/batman-adv.rst
  F:    include/uapi/linux/batman_adv.h
  F:    net/batman-adv/
  
@@@ -5101,7 -5101,6 +5101,7 @@@ F:      include/linux/of_net.
  F:    include/linux/phy.h
  F:    include/linux/phy_fixed.h
  F:    include/linux/platform_data/mdio-gpio.h
 +F:    include/linux/platform_data/mdio-bcm-unimac.h
  F:    include/trace/events/mdio.h
  F:    include/uapi/linux/mdio.h
  F:    include/uapi/linux/mii.h
@@@ -6148,14 -6147,6 +6148,14 @@@ S:    Maintaine
  F:    drivers/net/ethernet/hisilicon/
  F:    Documentation/devicetree/bindings/net/hisilicon*.txt
  
 +HISILICON NETWORK SUBSYSTEM 3 DRIVER (HNS3)
 +M:    Yisen Zhuang <yisen.zhuang@huawei.com>
 +M:    Salil Mehta <salil.mehta@huawei.com>
 +L:    netdev@vger.kernel.org
 +W:    http://www.hisilicon.com
 +S:    Maintained
 +F:    drivers/net/ethernet/hisilicon/hns3/
 +
  HISILICON ROCE DRIVER
  M:    Lijun Ou <oulijun@huawei.com>
  M:    Wei Hu(Xavier) <xavier.huwei@huawei.com>
@@@ -6266,7 -6257,6 +6266,7 @@@ M:      Haiyang Zhang <haiyangz@microsoft.co
  M:    Stephen Hemminger <sthemmin@microsoft.com>
  L:    devel@linuxdriverproject.org
  S:    Maintained
 +F:    Documentation/networking/netvsc.txt
  F:    arch/x86/include/asm/mshyperv.h
  F:    arch/x86/include/uapi/asm/hyperv.h
  F:    arch/x86/kernel/cpu/mshyperv.c
@@@ -7120,7 -7110,6 +7120,6 @@@ M:      Marc Zyngier <marc.zyngier@arm.com
  L:    linux-kernel@vger.kernel.org
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
- T:    git git://git.infradead.org/users/jcooper/linux.git irqchip/core
  F:    Documentation/devicetree/bindings/interrupt-controller/
  F:    drivers/irqchip/
  
@@@ -8434,9 -8423,7 +8433,9 @@@ F:      include/uapi/linux/uvcvideo.
  
  MEDIATEK ETHERNET DRIVER
  M:    Felix Fietkau <nbd@openwrt.org>
 -M:    John Crispin <blogic@openwrt.org>
 +M:    John Crispin <john@phrozen.org>
 +M:    Sean Wang <sean.wang@mediatek.com>
 +M:    Nelson Chang <nelson.chang@mediatek.com>
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    drivers/net/ethernet/mediatek/
diff --combined drivers/net/ethernet/mellanox/mlx4/main.c
index 3797491f4b6bfb7da67e1be2cacceaa5b5c0d069,5fe5cdc5135776abb8fd4df748b0948520504b48..9ea2b0db62290b015dd457c76f14295c67923270
@@@ -432,7 -432,7 +432,7 @@@ static int mlx4_dev_cap(struct mlx4_de
                /* Virtual PCI function needs to determine UAR page size from
                 * firmware. Only master PCI function can set the uar page size
                 */
-               if (enable_4k_uar)
+               if (enable_4k_uar || !dev->persist->num_vfs)
                        dev->uar_page_shift = DEFAULT_UAR_PAGE_SHIFT;
                else
                        dev->uar_page_shift = PAGE_SHIFT;
@@@ -925,10 -925,10 +925,10 @@@ static int mlx4_slave_cap(struct mlx4_d
        mlx4_replace_zero_macs(dev);
  
        dev->caps.qp0_qkey = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
 -      dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
 -      dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
 -      dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
 -      dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
 +      dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
 +      dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
 +      dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
 +      dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
  
        if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy ||
            !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy ||
@@@ -2277,7 -2277,7 +2277,7 @@@ static int mlx4_init_hca(struct mlx4_de
  
                dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1;
  
-               if (enable_4k_uar) {
+               if (enable_4k_uar || !dev->persist->num_vfs) {
                        init_hca.log_uar_sz = ilog2(dev->caps.num_uars) +
                                                    PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT;
                        init_hca.uar_page_sz = DEFAULT_UAR_PAGE_SHIFT - 12;
                dev->caps.rx_checksum_flags_port[2] = params.rx_csum_flags_port_2;
        }
        priv->eq_table.inta_pin = adapter.inta_pin;
 -      memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id);
 +      memcpy(dev->board_id, adapter.board_id, sizeof(dev->board_id));
  
        return 0;
  
@@@ -2869,7 -2869,7 +2869,7 @@@ static void mlx4_enable_msi_x(struct ml
                                dev->caps.num_eqs - dev->caps.reserved_eqs,
                                MAX_MSIX);
  
 -              entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL);
 +              entries = kcalloc(nreq, sizeof(*entries), GFP_KERNEL);
                if (!entries)
                        goto no_msi;
  
@@@ -3782,6 -3782,7 +3782,6 @@@ err_release_regions
  
  err_disable_pdev:
        mlx4_pci_disable_device(&priv->dev);
 -      pci_set_drvdata(pdev, NULL);
        return err;
  }
  
@@@ -3996,6 -3997,7 +3996,6 @@@ static void mlx4_remove_one(struct pci_
        devlink_unregister(devlink);
        kfree(dev->persist);
        devlink_free(devlink);
 -      pci_set_drvdata(pdev, NULL);
  }
  
  static int restore_current_port_types(struct mlx4_dev *dev,
diff --combined drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 4a990033c4d5702dd9ca55862aae0ec997b2973c,9f77ce038a4a339260e0118bb6a9f61e60547870..732f1d315fba55dbbca9b6fc6118f08cc6bc676b
@@@ -908,8 -908,7 +908,7 @@@ static int nfp_net_tx(struct sk_buff *s
        return NETDEV_TX_OK;
  
  err_unmap:
-       --f;
-       while (f >= 0) {
+       while (--f >= 0) {
                frag = &skb_shinfo(skb)->frags[f];
                dma_unmap_page(dp->dev, tx_ring->txbufs[wr_idx].dma_addr,
                               skb_frag_size(frag), DMA_TO_DEVICE);
@@@ -2660,7 -2659,6 +2659,7 @@@ static int nfp_net_netdev_close(struct 
        /* Step 2: Tell NFP
         */
        nfp_net_clear_config_and_disable(nn);
 +      nfp_port_configure(netdev, false);
  
        /* Step 3: Free resources
         */
@@@ -2778,21 -2776,16 +2777,21 @@@ static int nfp_net_netdev_open(struct n
                goto err_free_all;
  
        /* Step 2: Configure the NFP
 +       * - Ifup the physical interface if it exists
         * - Enable rings from 0 to tx_rings/rx_rings - 1.
         * - Write MAC address (in case it changed)
         * - Set the MTU
         * - Set the Freelist buffer size
         * - Enable the FW
         */
 -      err = nfp_net_set_config_and_enable(nn);
 +      err = nfp_port_configure(netdev, true);
        if (err)
                goto err_free_all;
  
 +      err = nfp_net_set_config_and_enable(nn);
 +      if (err)
 +              goto err_port_disable;
 +
        /* Step 3: Enable for kernel
         * - put some freelist descriptors on each RX ring
         * - enable NAPI on each ring
  
        return 0;
  
 +err_port_disable:
 +      nfp_port_configure(netdev, false);
  err_free_all:
        nfp_net_close_free_all(nn);
        return err;
diff --combined drivers/net/tun.c
index 19cbbbb1b63bbd74fcf7e74faf6541b3b7472b65,0a2c0a42283f780b947fcf52fcb4220e73f0fa2f..06e8f0bb2dab07b01c8d10737fc9cf30c489bd56
@@@ -73,8 -73,6 +73,8 @@@
  #include <linux/seq_file.h>
  #include <linux/uio.h>
  #include <linux/skb_array.h>
 +#include <linux/bpf.h>
 +#include <linux/bpf_trace.h>
  
  #include <linux/uaccess.h>
  
@@@ -107,9 -105,6 +107,9 @@@ do {                                                               
  } while (0)
  #endif
  
 +#define TUN_HEADROOM 256
 +#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD + TUN_HEADROOM)
 +
  /* TUN device flags */
  
  /* IFF_ATTACH_QUEUE is never stored in device flags,
@@@ -204,7 -199,7 +204,7 @@@ struct tun_struct 
        struct net_device       *dev;
        netdev_features_t       set_features;
  #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
 -                        NETIF_F_TSO6|NETIF_F_UFO)
 +                        NETIF_F_TSO6)
  
        int                     align;
        int                     vnet_hdr_sz;
        u32 flow_count;
        u32 rx_batched;
        struct tun_pcpu_stats __percpu *pcpu_stats;
 +      struct bpf_prog __rcu *xdp_prog;
  };
  
  #ifdef CONFIG_TUN_VNET_CROSS_LE
@@@ -591,7 -585,6 +591,7 @@@ static void tun_detach(struct tun_file 
  static void tun_detach_all(struct net_device *dev)
  {
        struct tun_struct *tun = netdev_priv(dev);
 +      struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog);
        struct tun_file *tfile, *tmp;
        int i, n = tun->numqueues;
  
        }
        BUG_ON(tun->numdisabled != 0);
  
 +      if (xdp_prog)
 +              bpf_prog_put(xdp_prog);
 +
        if (tun->flags & IFF_PERSIST)
                module_put(THIS_MODULE);
  }
@@@ -902,7 -892,7 +902,7 @@@ static netdev_tx_t tun_net_xmit(struct 
            sk_filter(tfile->socket.sk, skb))
                goto drop;
  
 -      if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
 +      if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                goto drop;
  
        skb_tx_timestamp(skb);
@@@ -1013,46 -1003,6 +1013,46 @@@ tun_net_get_stats64(struct net_device *
        stats->tx_dropped = tx_dropped;
  }
  
 +static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 +                     struct netlink_ext_ack *extack)
 +{
 +      struct tun_struct *tun = netdev_priv(dev);
 +      struct bpf_prog *old_prog;
 +
 +      old_prog = rtnl_dereference(tun->xdp_prog);
 +      rcu_assign_pointer(tun->xdp_prog, prog);
 +      if (old_prog)
 +              bpf_prog_put(old_prog);
 +
 +      return 0;
 +}
 +
 +static u32 tun_xdp_query(struct net_device *dev)
 +{
 +      struct tun_struct *tun = netdev_priv(dev);
 +      const struct bpf_prog *xdp_prog;
 +
 +      xdp_prog = rtnl_dereference(tun->xdp_prog);
 +      if (xdp_prog)
 +              return xdp_prog->aux->id;
 +
 +      return 0;
 +}
 +
 +static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp)
 +{
 +      switch (xdp->command) {
 +      case XDP_SETUP_PROG:
 +              return tun_xdp_set(dev, xdp->prog, xdp->extack);
 +      case XDP_QUERY_PROG:
 +              xdp->prog_id = tun_xdp_query(dev);
 +              xdp->prog_attached = !!xdp->prog_id;
 +              return 0;
 +      default:
 +              return -EINVAL;
 +      }
 +}
 +
  static const struct net_device_ops tun_netdev_ops = {
        .ndo_uninit             = tun_net_uninit,
        .ndo_open               = tun_net_open,
@@@ -1083,7 -1033,6 +1083,7 @@@ static const struct net_device_ops tap_
        .ndo_features_check     = passthru_features_check,
        .ndo_set_rx_headroom    = tun_set_headroom,
        .ndo_get_stats64        = tun_net_get_stats64,
 +      .ndo_xdp                = tun_xdp,
  };
  
  static void tun_flow_init(struct tun_struct *tun)
@@@ -1241,128 -1190,6 +1241,128 @@@ static void tun_rx_batched(struct tun_s
        }
  }
  
 +static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
 +                            int len, int noblock, bool zerocopy)
 +{
 +      if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
 +              return false;
 +
 +      if (tfile->socket.sk->sk_sndbuf != INT_MAX)
 +              return false;
 +
 +      if (!noblock)
 +              return false;
 +
 +      if (zerocopy)
 +              return false;
 +
 +      if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
 +          SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
 +              return false;
 +
 +      return true;
 +}
 +
 +static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 +                                   struct tun_file *tfile,
 +                                   struct iov_iter *from,
 +                                   struct virtio_net_hdr *hdr,
 +                                   int len, int *generic_xdp)
 +{
 +      struct page_frag *alloc_frag = &current->task_frag;
 +      struct sk_buff *skb;
 +      struct bpf_prog *xdp_prog;
 +      int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
 +                   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 +      unsigned int delta = 0;
 +      char *buf;
 +      size_t copied;
 +      bool xdp_xmit = false;
 +      int err;
 +
 +      if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
 +              return ERR_PTR(-ENOMEM);
 +
 +      buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
 +      copied = copy_page_from_iter(alloc_frag->page,
 +                                   alloc_frag->offset + TUN_RX_PAD,
 +                                   len, from);
 +      if (copied != len)
 +              return ERR_PTR(-EFAULT);
 +
 +      if (hdr->gso_type)
 +              *generic_xdp = 1;
 +      else
 +              *generic_xdp = 0;
 +
 +      rcu_read_lock();
 +      xdp_prog = rcu_dereference(tun->xdp_prog);
 +      if (xdp_prog && !*generic_xdp) {
 +              struct xdp_buff xdp;
 +              void *orig_data;
 +              u32 act;
 +
 +              xdp.data_hard_start = buf;
 +              xdp.data = buf + TUN_RX_PAD;
 +              xdp.data_end = xdp.data + len;
 +              orig_data = xdp.data;
 +              act = bpf_prog_run_xdp(xdp_prog, &xdp);
 +
 +              switch (act) {
 +              case XDP_REDIRECT:
 +                      get_page(alloc_frag->page);
 +                      alloc_frag->offset += buflen;
 +                      err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
 +                      if (err)
 +                              goto err_redirect;
 +                      return NULL;
 +              case XDP_TX:
 +                      xdp_xmit = true;
 +                      /* fall through */
 +              case XDP_PASS:
 +                      delta = orig_data - xdp.data;
 +                      break;
 +              default:
 +                      bpf_warn_invalid_xdp_action(act);
 +                      /* fall through */
 +              case XDP_ABORTED:
 +                      trace_xdp_exception(tun->dev, xdp_prog, act);
 +                      /* fall through */
 +              case XDP_DROP:
 +                      goto err_xdp;
 +              }
 +      }
 +
 +      skb = build_skb(buf, buflen);
 +      if (!skb) {
 +              rcu_read_unlock();
 +              return ERR_PTR(-ENOMEM);
 +      }
 +
 +      skb_reserve(skb, TUN_RX_PAD - delta);
 +      skb_put(skb, len + delta);
 +      get_page(alloc_frag->page);
 +      alloc_frag->offset += buflen;
 +
 +      if (xdp_xmit) {
 +              skb->dev = tun->dev;
 +              generic_xdp_tx(skb, xdp_prog);
 +              rcu_read_lock();
 +              return NULL;
 +      }
 +
 +      rcu_read_unlock();
 +
 +      return skb;
 +
 +err_redirect:
 +      put_page(alloc_frag->page);
 +err_xdp:
 +      rcu_read_unlock();
 +      this_cpu_inc(tun->pcpu_stats->rx_dropped);
 +      return NULL;
 +}
 +
  /* Get packet from user space buffer */
  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                            void *msg_control, struct iov_iter *from,
        bool zerocopy = false;
        int err;
        u32 rxhash;
 +      int generic_xdp = 1;
  
        if (!(tun->dev->flags & IFF_UP))
                return -EIO;
                        zerocopy = true;
        }
  
 -      if (!zerocopy) {
 -              copylen = len;
 -              if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
 -                      linear = good_linear;
 -              else
 -                      linear = tun16_to_cpu(tun, gso.hdr_len);
 -      }
 -
 -      skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
 -      if (IS_ERR(skb)) {
 -              if (PTR_ERR(skb) != -EAGAIN)
 +      if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
 +              skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
 +              if (IS_ERR(skb)) {
                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
 -              return PTR_ERR(skb);
 -      }
 +                      return PTR_ERR(skb);
 +              }
 +              if (!skb)
 +                      return total_len;
 +      } else {
 +              if (!zerocopy) {
 +                      copylen = len;
 +                      if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
 +                              linear = good_linear;
 +                      else
 +                              linear = tun16_to_cpu(tun, gso.hdr_len);
 +              }
  
 -      if (zerocopy)
 -              err = zerocopy_sg_from_iter(skb, from);
 -      else
 -              err = skb_copy_datagram_from_iter(skb, 0, from, len);
 +              skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
 +              if (IS_ERR(skb)) {
 +                      if (PTR_ERR(skb) != -EAGAIN)
 +                              this_cpu_inc(tun->pcpu_stats->rx_dropped);
 +                      return PTR_ERR(skb);
 +              }
  
 -      if (err) {
 -              this_cpu_inc(tun->pcpu_stats->rx_dropped);
 -              kfree_skb(skb);
 -              return -EFAULT;
 +              if (zerocopy)
 +                      err = zerocopy_sg_from_iter(skb, from);
 +              else
 +                      err = skb_copy_datagram_from_iter(skb, 0, from, len);
 +
 +              if (err) {
 +                      this_cpu_inc(tun->pcpu_stats->rx_dropped);
 +                      kfree_skb(skb);
 +                      return -EFAULT;
 +              }
        }
  
        if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
        skb_reset_network_header(skb);
        skb_probe_transport_header(skb, 0);
  
 +      if (generic_xdp) {
 +              struct bpf_prog *xdp_prog;
 +              int ret;
 +
 +              rcu_read_lock();
 +              xdp_prog = rcu_dereference(tun->xdp_prog);
 +              if (xdp_prog) {
 +                      ret = do_xdp_generic(xdp_prog, skb);
 +                      if (ret != XDP_PASS) {
 +                              rcu_read_unlock();
 +                              return total_len;
 +                      }
 +              }
 +              rcu_read_unlock();
 +      }
 +
        rxhash = __skb_get_hash_symmetric(skb);
  #ifndef CONFIG_4KSTACKS
        tun_rx_batched(tun, tfile, skb, more);
@@@ -2079,6 -1879,9 +2079,9 @@@ static int tun_set_iff(struct net *net
  
  err_detach:
        tun_detach_all(dev);
+       /* register_netdevice() already called tun_free_netdev() */
+       goto err_free_dev;
  err_free_flow:
        tun_flow_uninit(tun);
        security_tun_dev_free_security(tun->security);
@@@ -2121,6 -1924,11 +2124,6 @@@ static int set_offload(struct tun_struc
                                features |= NETIF_F_TSO6;
                        arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
                }
 -
 -              if (arg & TUN_F_UFO) {
 -                      features |= NETIF_F_UFO;
 -                      arg &= ~TUN_F_UFO;
 -              }
        }
  
        /* This gives the user a way to test for new features in future by
@@@ -2732,7 -2540,7 +2735,7 @@@ static int tun_queue_resize(struct tun_
        int n = tun->numqueues + tun->numdisabled;
        int ret, i;
  
 -      arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
 +      arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL);
        if (!arrays)
                return -ENOMEM;
  
diff --combined include/net/ip.h
index 39db596eb89fc346c549945482582f0abc89b6f0,0cf7f5a65fe6be2be30259aa5cd251d02de489d5..9896f46cbbf11235395d75a5ec18a14736ee099d
@@@ -78,16 -78,6 +78,16 @@@ struct ipcm_cookie 
  #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
  #define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb))
  
 +/* return enslaved device index if relevant */
 +static inline int inet_sdif(struct sk_buff *skb)
 +{
 +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
 +      if (skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
 +              return IPCB(skb)->iif;
 +#endif
 +      return 0;
 +}
 +
  struct ip_ra_chain {
        struct ip_ra_chain __rcu *next;
        struct sock             *sk;
@@@ -362,7 -352,7 +362,7 @@@ static inline unsigned int ip_dst_mtu_m
            !forwarding)
                return dst_mtu(dst);
  
-       return min(dst->dev->mtu, IP_MAX_MTU);
+       return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU);
  }
  
  static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
                return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
        }
  
-       return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU);
+       return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
  }
  
  u32 ip_idents_reserve(u32 hash, int segs);
@@@ -577,12 -567,11 +577,12 @@@ int ip_forward(struct sk_buff *skb)
  void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
                      __be32 daddr, struct rtable *rt, int is_frag);
  
 -int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
 -                    const struct ip_options *sopt);
 -static inline int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
 +int __ip_options_echo(struct net *net, struct ip_options *dopt,
 +                    struct sk_buff *skb, const struct ip_options *sopt);
 +static inline int ip_options_echo(struct net *net, struct ip_options *dopt,
 +                                struct sk_buff *skb)
  {
 -      return __ip_options_echo(dopt, skb, &IPCB(skb)->opt);
 +      return __ip_options_echo(net, dopt, skb, &IPCB(skb)->opt);
  }
  
  void ip_options_fragment(struct sk_buff *skb);
diff --combined include/net/sch_generic.h
index 107c5243224507144d53363b04a303ecf3397090,67f815e5d52517390226bc3531b1ea7b5f1020bc..1688f0f6c7ba9b4905682bbeb2417be8a79a98aa
@@@ -156,6 -156,7 +156,6 @@@ struct Qdisc_class_ops 
  
        /* Filter manipulation */
        struct tcf_block *      (*tcf_block)(struct Qdisc *, unsigned long);
 -      bool                    (*tcf_cl_offload)(u32 classid);
        unsigned long           (*bind_tcf)(struct Qdisc *, unsigned long,
                                        u32 classid);
        void                    (*unbind_tcf)(struct Qdisc *, unsigned long);
@@@ -212,16 -213,16 +212,16 @@@ struct tcf_proto_ops 
        int                     (*init)(struct tcf_proto*);
        void                    (*destroy)(struct tcf_proto*);
  
 -      unsigned long           (*get)(struct tcf_proto*, u32 handle);
 +      void*                   (*get)(struct tcf_proto*, u32 handle);
        int                     (*change)(struct net *net, struct sk_buff *,
                                        struct tcf_proto*, unsigned long,
                                        u32 handle, struct nlattr **,
 -                                      unsigned long *, bool);
 -      int                     (*delete)(struct tcf_proto*, unsigned long, bool*);
 +                                      void **, bool);
 +      int                     (*delete)(struct tcf_proto*, void *, bool*);
        void                    (*walk)(struct tcf_proto*, struct tcf_walker *arg);
  
        /* rtnetlink specific */
 -      int                     (*dump)(struct net*, struct tcf_proto*, unsigned long,
 +      int                     (*dump)(struct net*, struct tcf_proto*, void *,
                                        struct sk_buff *skb, struct tcmsg*);
  
        struct module           *owner;
@@@ -393,9 -394,6 +393,9 @@@ qdisc_class_find(const struct Qdisc_cla
        struct Qdisc_class_common *cl;
        unsigned int h;
  
 +      if (!id)
 +              return NULL;
 +
        h = qdisc_class_hash(id, hash->hashmask);
        hlist_for_each_entry(cl, &hash->hash[h], hnode) {
                if (cl->classid == id)
@@@ -808,8 -806,11 +808,11 @@@ static inline struct Qdisc *qdisc_repla
        old = *pold;
        *pold = new;
        if (old != NULL) {
-               qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog);
+               unsigned int qlen = old->q.qlen;
+               unsigned int backlog = old->qstats.backlog;
                qdisc_reset(old);
+               qdisc_tree_reduce_backlog(old, qlen, backlog);
        }
        sch_tree_unlock(sch);
  
diff --combined include/net/sock.h
index fe1a0bc25cd3e5a2ce8fb9965df7ac532d592bdb,aeeec62992ca7dc5ff80f8d7164a1c143f606b03..1c2912d433e81b10f3fdc87bcfcbb091570edc03
@@@ -294,7 -294,6 +294,7 @@@ struct sock_common 
    *   @sk_stamp: time stamp of last packet received
    *   @sk_tsflags: SO_TIMESTAMPING socket options
    *   @sk_tskey: counter to disambiguate concurrent tstamp requests
 +  *   @sk_zckey: counter to order MSG_ZEROCOPY notifications
    *   @sk_socket: Identd and reporting IO signals
    *   @sk_user_data: RPC layer private data
    *   @sk_frag: cached page frag
@@@ -463,7 -462,6 +463,7 @@@ struct sock 
        u16                     sk_tsflags;
        u8                      sk_shutdown;
        u32                     sk_tskey;
 +      atomic_t                sk_zckey;
        struct socket           *sk_socket;
        void                    *sk_user_data;
  #ifdef CONFIG_SECURITY
@@@ -509,9 -507,7 +509,7 @@@ int sk_set_peek_off(struct sock *sk, in
  static inline int sk_peek_offset(struct sock *sk, int flags)
  {
        if (unlikely(flags & MSG_PEEK)) {
-               s32 off = READ_ONCE(sk->sk_peek_off);
-               if (off >= 0)
-                       return off;
+               return READ_ONCE(sk->sk_peek_off);
        }
  
        return 0;
@@@ -1533,8 -1529,6 +1531,8 @@@ struct sk_buff *sock_wmalloc(struct soc
                             gfp_t priority);
  void __sock_wfree(struct sk_buff *skb);
  void sock_wfree(struct sk_buff *skb);
 +struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
 +                           gfp_t priority);
  void skb_orphan_partial(struct sk_buff *skb);
  void sock_rfree(struct sk_buff *skb);
  void sock_efree(struct sk_buff *skb);
@@@ -1586,14 -1580,11 +1584,14 @@@ int sock_no_shutdown(struct socket *, i
  int sock_no_getsockopt(struct socket *, int , int, char __user *, int __user *);
  int sock_no_setsockopt(struct socket *, int, int, char __user *, unsigned int);
  int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
 +int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
  int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
  int sock_no_mmap(struct file *file, struct socket *sock,
                 struct vm_area_struct *vma);
  ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset,
                         size_t size, int flags);
 +ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
 +                              int offset, size_t size, int flags);
  
  /*
   * Functions to fill in entries in struct proto_ops when a protocol
diff --combined kernel/events/core.c
index a7a6c1d19a4929f9879cbc2f1eef1b4434356af0,ee20d4c546b5ebc0248c084e11c483e0ef800c6f..8c01572709aca5c4144d30d2226e1deeab9e3e0e
@@@ -2217,6 -2217,33 +2217,33 @@@ static int group_can_go_on(struct perf_
        return can_add_hw;
  }
  
+ /*
+  * Complement to update_event_times(). This computes the tstamp_* values to
+  * continue 'enabled' state from @now, and effectively discards the time
+  * between the prior tstamp_stopped and now (as we were in the OFF state, or
+  * just switched (context) time base).
+  *
+  * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
+  * cannot have been scheduled in yet. And going into INACTIVE state means
+  * '@event->tstamp_stopped = @now'.
+  *
+  * Thus given the rules of update_event_times():
+  *
+  *   total_time_enabled = tstamp_stopped - tstamp_enabled
+  *   total_time_running = tstamp_stopped - tstamp_running
+  *
+  * We can insert 'tstamp_stopped == now' and reverse them to compute new
+  * tstamp_* values.
+  */
+ static void __perf_event_enable_time(struct perf_event *event, u64 now)
+ {
+       WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
+       event->tstamp_stopped = now;
+       event->tstamp_enabled = now - event->total_time_enabled;
+       event->tstamp_running = now - event->total_time_running;
+ }
  static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
  {
  
        list_add_event(event, ctx);
        perf_group_attach(event);
-       event->tstamp_enabled = tstamp;
-       event->tstamp_running = tstamp;
-       event->tstamp_stopped = tstamp;
+       /*
+        * We can be called with event->state == STATE_OFF when we create with
+        * .disabled = 1. In that case the IOC_ENABLE will call this function.
+        */
+       if (event->state == PERF_EVENT_STATE_INACTIVE)
+               __perf_event_enable_time(event, tstamp);
  }
  
  static void ctx_sched_out(struct perf_event_context *ctx,
@@@ -2471,10 -2501,11 +2501,11 @@@ static void __perf_event_mark_enabled(s
        u64 tstamp = perf_event_time(event);
  
        event->state = PERF_EVENT_STATE_INACTIVE;
-       event->tstamp_enabled = tstamp - event->total_time_enabled;
+       __perf_event_enable_time(event, tstamp);
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
+               /* XXX should not be > INACTIVE if event isn't */
                if (sub->state >= PERF_EVENT_STATE_INACTIVE)
-                       sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+                       __perf_event_enable_time(sub, tstamp);
        }
  }
  
@@@ -5090,7 -5121,7 +5121,7 @@@ static void perf_mmap_open(struct vm_ar
                atomic_inc(&event->rb->aux_mmap_count);
  
        if (event->pmu->event_mapped)
-               event->pmu->event_mapped(event);
+               event->pmu->event_mapped(event, vma->vm_mm);
  }
  
  static void perf_pmu_output_stop(struct perf_event *event);
@@@ -5113,7 -5144,7 +5144,7 @@@ static void perf_mmap_close(struct vm_a
        unsigned long size = perf_data_size(rb);
  
        if (event->pmu->event_unmapped)
-               event->pmu->event_unmapped(event);
+               event->pmu->event_unmapped(event, vma->vm_mm);
  
        /*
         * rb->aux_mmap_count will always drop before rb->mmap_count and
@@@ -5411,7 -5442,7 +5442,7 @@@ aux_unlock
        vma->vm_ops = &perf_mmap_vmops;
  
        if (event->pmu->event_mapped)
-               event->pmu->event_mapped(event);
+               event->pmu->event_mapped(event, vma->vm_mm);
  
        return ret;
  }
@@@ -8050,7 -8081,7 +8081,7 @@@ static void perf_event_free_bpf_handler
  
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
  {
 -      bool is_kprobe, is_tracepoint;
 +      bool is_kprobe, is_tracepoint, is_syscall_tp;
        struct bpf_prog *prog;
  
        if (event->attr.type != PERF_TYPE_TRACEPOINT)
  
        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
        is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
 -      if (!is_kprobe && !is_tracepoint)
 +      is_syscall_tp = is_syscall_trace_event(event->tp_event);
 +      if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
                /* bpf programs can only be attached to u/kprobe or tracepoint */
                return -EINVAL;
  
                return PTR_ERR(prog);
  
        if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
 -          (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 +          (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
 +          (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
                /* valid fd, but invalid bpf program type */
                bpf_prog_put(prog);
                return -EINVAL;
        }
  
 -      if (is_tracepoint) {
 +      if (is_tracepoint || is_syscall_tp) {
                int off = trace_event_get_offsets(event->tp_event);
  
                if (prog->aux->max_ctx_offset > off) {
diff --combined net/core/datagram.c
index 2f3277945d356bd13d51b213bfbca59f70c351c1,a21ca8dee5eadca0d9ab7c78a939ac90bb3963b0..a4d5f10d83a1ca6cf9bb1e8dc6d6faeae5947e4d
@@@ -169,14 -169,20 +169,20 @@@ struct sk_buff *__skb_try_recv_from_que
                                          int *peeked, int *off, int *err,
                                          struct sk_buff **last)
  {
+       bool peek_at_off = false;
        struct sk_buff *skb;
-       int _off = *off;
+       int _off = 0;
+       if (unlikely(flags & MSG_PEEK && *off >= 0)) {
+               peek_at_off = true;
+               _off = *off;
+       }
  
        *last = queue->prev;
        skb_queue_walk(queue, skb) {
                if (flags & MSG_PEEK) {
-                       if (_off >= skb->len && (skb->len || _off ||
-                                                skb->peeked)) {
+                       if (peek_at_off && _off >= skb->len &&
+                           (_off || skb->peeked)) {
                                _off -= skb->len;
                                continue;
                        }
@@@ -573,12 -579,27 +579,12 @@@ fault
  }
  EXPORT_SYMBOL(skb_copy_datagram_from_iter);
  
 -/**
 - *    zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
 - *    @skb: buffer to copy
 - *    @from: the source to copy from
 - *
 - *    The function will first copy up to headlen, and then pin the userspace
 - *    pages and build frags through them.
 - *
 - *    Returns 0, -EFAULT or -EMSGSIZE.
 - */
 -int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 +int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 +                          struct iov_iter *from, size_t length)
  {
 -      int len = iov_iter_count(from);
 -      int copy = min_t(int, skb_headlen(skb), len);
 -      int frag = 0;
 +      int frag = skb_shinfo(skb)->nr_frags;
  
 -      /* copy up to skb headlen */
 -      if (skb_copy_datagram_from_iter(skb, 0, from, copy))
 -              return -EFAULT;
 -
 -      while (iov_iter_count(from)) {
 +      while (length && iov_iter_count(from)) {
                struct page *pages[MAX_SKB_FRAGS];
                size_t start;
                ssize_t copied;
                if (frag == MAX_SKB_FRAGS)
                        return -EMSGSIZE;
  
 -              copied = iov_iter_get_pages(from, pages, ~0U,
 +              copied = iov_iter_get_pages(from, pages, length,
                                            MAX_SKB_FRAGS - frag, &start);
                if (copied < 0)
                        return -EFAULT;
  
                iov_iter_advance(from, copied);
 +              length -= copied;
  
                truesize = PAGE_ALIGN(copied + start);
                skb->data_len += copied;
                skb->len += copied;
                skb->truesize += truesize;
 -              refcount_add(truesize, &skb->sk->sk_wmem_alloc);
 +              if (sk && sk->sk_type == SOCK_STREAM) {
 +                      sk->sk_wmem_queued += truesize;
 +                      sk_mem_charge(sk, truesize);
 +              } else {
 +                      refcount_add(truesize, &skb->sk->sk_wmem_alloc);
 +              }
                while (copied) {
                        int size = min_t(int, copied, PAGE_SIZE - start);
                        skb_fill_page_desc(skb, frag++, pages[n], start, size);
        }
        return 0;
  }
 +EXPORT_SYMBOL(__zerocopy_sg_from_iter);
 +
 +/**
 + *    zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
 + *    @skb: buffer to copy
 + *    @from: the source to copy from
 + *
 + *    The function will first copy up to headlen, and then pin the userspace
 + *    pages and build frags through them.
 + *
 + *    Returns 0, -EFAULT or -EMSGSIZE.
 + */
 +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 +{
 +      int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));
 +
 +      /* copy up to skb headlen */
 +      if (skb_copy_datagram_from_iter(skb, 0, from, copy))
 +              return -EFAULT;
 +
 +      return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
 +}
  EXPORT_SYMBOL(zerocopy_sg_from_iter);
  
  static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
diff --combined net/ipv4/igmp.c
index 9f86b5133605c90c58f670460994cf9986f87339,caf2f1101d027b7b6e8d9683887e16c7bd4a8438..ab183af0b5b6a8f9b7fd02b32b56d32487518f7a
@@@ -1007,10 -1007,18 +1007,18 @@@ int igmp_rcv(struct sk_buff *skb
  {
        /* This basically follows the spec line by line -- see RFC1112 */
        struct igmphdr *ih;
-       struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+       struct net_device *dev = skb->dev;
+       struct in_device *in_dev;
        int len = skb->len;
        bool dropped = true;
  
+       if (netif_is_l3_master(dev)) {
+               dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif);
+               if (!dev)
+                       goto drop;
+       }
+       in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                goto drop;
  
@@@ -2549,8 -2557,7 +2557,8 @@@ done
  /*
   * check if a multicast source filter allows delivery for a given <src,dst,intf>
   */
 -int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
 +int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
 +                 int dif, int sdif)
  {
        struct inet_sock *inet = inet_sk(sk);
        struct ip_mc_socklist *pmc;
        rcu_read_lock();
        for_each_pmc_rcu(inet, pmc) {
                if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
 -                  pmc->multi.imr_ifindex == dif)
 +                  (pmc->multi.imr_ifindex == dif ||
 +                   (sdif && pmc->multi.imr_ifindex == sdif)))
                        break;
        }
        ret = inet->mc_all;
diff --combined net/ipv4/route.c
index 872b4cb136d3fa0cda403836cc83a156a65310a3,2331de20ca505d7f25fe9d93d5320e9e39af6c39..94d4cd2d5ea4f4589783528d8e951d3365078bc6
@@@ -1267,7 -1267,7 +1267,7 @@@ static unsigned int ipv4_mtu(const stru
        if (mtu)
                return mtu;
  
-       mtu = dst->dev->mtu;
+       mtu = READ_ONCE(dst->dev->mtu);
  
        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
@@@ -1398,7 -1398,7 +1398,7 @@@ static void ipv4_dst_destroy(struct dst
        struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
        struct rtable *rt = (struct rtable *) dst;
  
 -      if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
 +      if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
                kfree(p);
  
        if (!list_empty(&rt->rt_uncached)) {
@@@ -1456,7 -1456,7 +1456,7 @@@ static void rt_set_nexthop(struct rtabl
                dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
                if (fi->fib_metrics != &dst_default_metrics) {
                        rt->dst._metrics |= DST_METRICS_REFCOUNTED;
 -                      atomic_inc(&fi->fib_metrics->refcnt);
 +                      refcount_inc(&fi->fib_metrics->refcnt);
                }
  #ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
@@@ -2236,7 -2236,7 +2236,7 @@@ add
        if (!rth)
                return ERR_PTR(-ENOBUFS);
  
 -      rth->rt_iif     = orig_oif ? : 0;
 +      rth->rt_iif = orig_oif;
        if (res->table)
                rth->rt_table_id = res->table->tb_id;
  
@@@ -2439,12 -2439,6 +2439,12 @@@ struct rtable *ip_route_output_key_hash
                /* L3 master device is the loopback for that domain */
                dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
                        net->loopback_dev;
 +
 +              /* make sure orig_oif points to fib result device even
 +               * though packet rx/tx happens over loopback or l3mdev
 +               */
 +              orig_oif = FIB_RES_OIF(*res);
 +
                fl4->flowi4_oif = dev_out->ifindex;
                flags |= RTCF_LOCAL;
                goto make_route;
@@@ -2769,14 -2763,21 +2769,21 @@@ static int inet_rtm_getroute(struct sk_
        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
                table_id = rt->rt_table_id;
  
-       if (rtm->rtm_flags & RTM_F_FIB_MATCH)
+       if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
+               if (!res.fi) {
+                       err = fib_props[res.type].error;
+                       if (!err)
+                               err = -EHOSTUNREACH;
+                       goto errout_free;
+               }
                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
                                    rt->rt_type, res.prefix, res.prefixlen,
                                    fl4.flowi4_tos, res.fi, 0);
-       else
+       } else {
                err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
                                   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
+       }
        if (err < 0)
                goto errout_free;
  
@@@ -3074,8 -3075,7 +3081,8 @@@ int __init ip_rt_init(void
        xfrm_init();
        xfrm4_init();
  #endif
 -      rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
 +      rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
 +                    RTNL_FLAG_DOIT_UNLOCKED);
  
  #ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
diff --combined net/ipv4/tcp_input.c
index d73903fe8c83d68ee005c1bb9136d9ab008469fd,bab7f0493098c6521f445923d721d296f326f7e1..ddc854728a6011cc36d82ef0e226c0fbbbec339d
@@@ -103,6 -103,7 +103,6 @@@ int sysctl_tcp_invalid_ratelimit __read
  #define FLAG_DATA_SACKED      0x20 /* New SACK.                               */
  #define FLAG_ECE              0x40 /* ECE in this ACK                         */
  #define FLAG_LOST_RETRANS     0x80 /* This ACK marks some retransmission lost */
 -#define FLAG_SLOWPATH         0x100 /* Do not skip RFC checks for window update.*/
  #define FLAG_ORIG_SACK_ACKED  0x200 /* Never retransmitted data are (s)acked  */
  #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
  #define FLAG_DSACKING_ACK     0x800 /* SACK blocks contained D-SACK info */
@@@ -1951,7 -1952,6 +1951,7 @@@ void tcp_enter_loss(struct sock *sk
            !after(tp->high_seq, tp->snd_una) ||
            (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
                tp->prior_ssthresh = tcp_current_ssthresh(sk);
 +              tp->prior_cwnd = tp->snd_cwnd;
                tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
                tcp_ca_event(sk, CA_EVENT_LOSS);
                tcp_init_undo(tp);
@@@ -3009,8 -3009,7 +3009,7 @@@ void tcp_rearm_rto(struct sock *sk
                        /* delta_us may not be positive if the socket is locked
                         * when the retrans timer fires and is rescheduled.
                         */
-                       if (delta_us > 0)
-                               rto = usecs_to_jiffies(delta_us);
+                       rto = usecs_to_jiffies(max_t(int, delta_us, 1));
                }
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
                                          TCP_RTO_MAX);
@@@ -3372,6 -3371,12 +3371,6 @@@ static int tcp_ack_update_window(struc
                if (tp->snd_wnd != nwin) {
                        tp->snd_wnd = nwin;
  
 -                      /* Note, it is the only place, where
 -                       * fast path is recovered for sending TCP.
 -                       */
 -                      tp->pred_flags = 0;
 -                      tcp_fast_path_check(sk);
 -
                        if (tcp_send_head(sk))
                                tcp_slow_start_after_idle_check(sk);
  
@@@ -3553,7 -3558,6 +3552,7 @@@ static int tcp_ack(struct sock *sk, con
        u32 lost = tp->lost;
        int acked = 0; /* Number of packets newly acked */
        int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
 +      u32 ack_ev_flags = 0;
  
        sack_state.first_sackt = 0;
        sack_state.rate = &rs;
        if (flag & FLAG_UPDATE_TS_RECENT)
                tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
  
 -      if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
 -              /* Window is constant, pure forward advance.
 -               * No more checks are required.
 -               * Note, we use the fact that SND.UNA>=SND.WL2.
 -               */
 -              tcp_update_wl(tp, ack_seq);
 -              tcp_snd_una_update(tp, ack);
 -              flag |= FLAG_WIN_UPDATE;
 -
 -              tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
 -
 -              NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
 -      } else {
 -              u32 ack_ev_flags = CA_ACK_SLOWPATH;
 -
 -              if (ack_seq != TCP_SKB_CB(skb)->end_seq)
 -                      flag |= FLAG_DATA;
 -              else
 -                      NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
 +      if (ack_seq != TCP_SKB_CB(skb)->end_seq)
 +              flag |= FLAG_DATA;
 +      else
 +              NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
  
 -              flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
 +      flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
  
 -              if (TCP_SKB_CB(skb)->sacked)
 -                      flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 -                                                      &sack_state);
 +      if (TCP_SKB_CB(skb)->sacked)
 +              flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 +                                              &sack_state);
  
 -              if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
 -                      flag |= FLAG_ECE;
 -                      ack_ev_flags |= CA_ACK_ECE;
 -              }
 +      if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
 +              flag |= FLAG_ECE;
 +              ack_ev_flags = CA_ACK_ECE;
 +      }
  
 -              if (flag & FLAG_WIN_UPDATE)
 -                      ack_ev_flags |= CA_ACK_WIN_UPDATE;
 +      if (flag & FLAG_WIN_UPDATE)
 +              ack_ev_flags |= CA_ACK_WIN_UPDATE;
  
 -              tcp_in_ack_event(sk, ack_ev_flags);
 -      }
 +      tcp_in_ack_event(sk, ack_ev_flags);
  
        /* We passed data and got it acked, remove any soft error
         * log. Something worked...
@@@ -4381,6 -4401,8 +4380,6 @@@ static void tcp_data_queue_ofo(struct s
                return;
        }
  
 -      /* Disable header prediction. */
 -      tp->pred_flags = 0;
        inet_csk_schedule_ack(sk);
  
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
@@@ -4569,8 -4591,8 +4568,8 @@@ err
  static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
  {
        struct tcp_sock *tp = tcp_sk(sk);
 -      bool fragstolen = false;
 -      int eaten = -1;
 +      bool fragstolen;
 +      int eaten;
  
        if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
                __kfree_skb(skb);
                        goto out_of_window;
  
                /* Ok. In sequence. In window. */
 -              if (tp->ucopy.task == current &&
 -                  tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
 -                  sock_owned_by_user(sk) && !tp->urg_data) {
 -                      int chunk = min_t(unsigned int, skb->len,
 -                                        tp->ucopy.len);
 -
 -                      __set_current_state(TASK_RUNNING);
 -
 -                      if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
 -                              tp->ucopy.len -= chunk;
 -                              tp->copied_seq += chunk;
 -                              eaten = (chunk == skb->len);
 -                              tcp_rcv_space_adjust(sk);
 -                      }
 -              }
 -
 -              if (eaten <= 0) {
  queue_and_out:
 -                      if (eaten < 0) {
 -                              if (skb_queue_len(&sk->sk_receive_queue) == 0)
 -                                      sk_forced_mem_schedule(sk, skb->truesize);
 -                              else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
 -                                      goto drop;
 -                      }
 -                      eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
 -              }
 +              if (skb_queue_len(&sk->sk_receive_queue) == 0)
 +                      sk_forced_mem_schedule(sk, skb->truesize);
 +              else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
 +                      goto drop;
 +
 +              eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
                tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
                if (skb->len)
                        tcp_event_data_recv(sk, skb);
                if (tp->rx_opt.num_sacks)
                        tcp_sack_remove(tp);
  
 -              tcp_fast_path_check(sk);
 -
                if (eaten > 0)
                        kfree_skb_partial(skb, fragstolen);
                if (!sock_flag(sk, SOCK_DEAD))
@@@ -4943,6 -4986,7 +4942,6 @@@ static int tcp_prune_queue(struct sock 
        NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
  
        /* Massive buffer overcommit. */
 -      tp->pred_flags = 0;
        return -1;
  }
  
@@@ -5114,6 -5158,9 +5113,6 @@@ static void tcp_check_urg(struct sock *
  
        tp->urg_data = TCP_URG_NOTYET;
        tp->urg_seq = ptr;
 -
 -      /* Disable header prediction. */
 -      tp->pred_flags = 0;
  }
  
  /* This is the 'fast' part of urgent handling. */
@@@ -5142,6 -5189,26 +5141,6 @@@ static void tcp_urg(struct sock *sk, st
        }
  }
  
 -static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 -{
 -      struct tcp_sock *tp = tcp_sk(sk);
 -      int chunk = skb->len - hlen;
 -      int err;
 -
 -      if (skb_csum_unnecessary(skb))
 -              err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
 -      else
 -              err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
 -
 -      if (!err) {
 -              tp->ucopy.len -= chunk;
 -              tp->copied_seq += chunk;
 -              tcp_rcv_space_adjust(sk);
 -      }
 -
 -      return err;
 -}
 -
  /* Accept RST for rcv_nxt - 1 after a FIN.
   * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
   * FIN is sent followed by a RST packet. The RST is sent with the same
@@@ -5272,29 -5339,201 +5271,29 @@@ discard
  
  /*
   *    TCP receive function for the ESTABLISHED state.
 - *
 - *    It is split into a fast path and a slow path. The fast path is
 - *    disabled when:
 - *    - A zero window was announced from us - zero window probing
 - *        is only handled properly in the slow path.
 - *    - Out of order segments arrived.
 - *    - Urgent data is expected.
 - *    - There is no buffer space left
 - *    - Unexpected TCP flags/window values/header lengths are received
 - *      (detected by checking the TCP header against pred_flags)
 - *    - Data is sent in both directions. Fast path only supports pure senders
 - *      or pure receivers (this means either the sequence number or the ack
 - *      value must stay constant)
 - *    - Unexpected TCP option.
 - *
 - *    When these conditions are not satisfied it drops into a standard
 - *    receive procedure patterned after RFC793 to handle all cases.
 - *    The first three cases are guaranteed by proper pred_flags setting,
 - *    the rest is checked inline. Fast processing is turned on in
 - *    tcp_data_queue when everything is OK.
   */
  void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 -                       const struct tcphdr *th, unsigned int len)
 +                       const struct tcphdr *th)
  {
 +      unsigned int len = skb->len;
        struct tcp_sock *tp = tcp_sk(sk);
  
        tcp_mstamp_refresh(tp);
        if (unlikely(!sk->sk_rx_dst))
                inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
 -      /*
 -       *      Header prediction.
 -       *      The code loosely follows the one in the famous
 -       *      "30 instruction TCP receive" Van Jacobson mail.
 -       *
 -       *      Van's trick is to deposit buffers into socket queue
 -       *      on a device interrupt, to call tcp_recv function
 -       *      on the receive process context and checksum and copy
 -       *      the buffer to user space. smart...
 -       *
 -       *      Our current scheme is not silly either but we take the
 -       *      extra cost of the net_bh soft interrupt processing...
 -       *      We do checksum and copy also but from device to kernel.
 -       */
  
        tp->rx_opt.saw_tstamp = 0;
  
 -      /*      pred_flags is 0xS?10 << 16 + snd_wnd
 -       *      if header_prediction is to be made
 -       *      'S' will always be tp->tcp_header_len >> 2
 -       *      '?' will be 0 for the fast path, otherwise pred_flags is 0 to
 -       *  turn it off (when there are holes in the receive
 -       *       space for instance)
 -       *      PSH flag is ignored.
 -       */
 -
 -      if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
 -          TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
 -          !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
 -              int tcp_header_len = tp->tcp_header_len;
 -
 -              /* Timestamp header prediction: tcp_header_len
 -               * is automatically equal to th->doff*4 due to pred_flags
 -               * match.
 -               */
 -
 -              /* Check timestamp */
 -              if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
 -                      /* No? Slow path! */
 -                      if (!tcp_parse_aligned_timestamp(tp, th))
 -                              goto slow_path;
 -
 -                      /* If PAWS failed, check it more carefully in slow path */
 -                      if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
 -                              goto slow_path;
 -
 -                      /* DO NOT update ts_recent here, if checksum fails
 -                       * and timestamp was corrupted part, it will result
 -                       * in a hung connection since we will drop all
 -                       * future packets due to the PAWS test.
 -                       */
 -              }
 -
 -              if (len <= tcp_header_len) {
 -                      /* Bulk data transfer: sender */
 -                      if (len == tcp_header_len) {
 -                              /* Predicted packet is in window by definition.
 -                               * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 -                               * Hence, check seq<=rcv_wup reduces to:
 -                               */
 -                              if (tcp_header_len ==
 -                                  (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
 -                                  tp->rcv_nxt == tp->rcv_wup)
 -                                      tcp_store_ts_recent(tp);
 -
 -                              /* We know that such packets are checksummed
 -                               * on entry.
 -                               */
 -                              tcp_ack(sk, skb, 0);
 -                              __kfree_skb(skb);
 -                              tcp_data_snd_check(sk);
 -                              return;
 -                      } else { /* Header too small */
 -                              TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
 -                              goto discard;
 -                      }
 -              } else {
 -                      int eaten = 0;
 -                      bool fragstolen = false;
 -
 -                      if (tp->ucopy.task == current &&
 -                          tp->copied_seq == tp->rcv_nxt &&
 -                          len - tcp_header_len <= tp->ucopy.len &&
 -                          sock_owned_by_user(sk)) {
 -                              __set_current_state(TASK_RUNNING);
 -
 -                              if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
 -                                      /* Predicted packet is in window by definition.
 -                                       * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 -                                       * Hence, check seq<=rcv_wup reduces to:
 -                                       */
 -                                      if (tcp_header_len ==
 -                                          (sizeof(struct tcphdr) +
 -                                           TCPOLEN_TSTAMP_ALIGNED) &&
 -                                          tp->rcv_nxt == tp->rcv_wup)
 -                                              tcp_store_ts_recent(tp);
 -
 -                                      tcp_rcv_rtt_measure_ts(sk, skb);
 -
 -                                      __skb_pull(skb, tcp_header_len);
 -                                      tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
 -                                      NET_INC_STATS(sock_net(sk),
 -                                                      LINUX_MIB_TCPHPHITSTOUSER);
 -                                      eaten = 1;
 -                              }
 -                      }
 -                      if (!eaten) {
 -                              if (tcp_checksum_complete(skb))
 -                                      goto csum_error;
 -
 -                              if ((int)skb->truesize > sk->sk_forward_alloc)
 -                                      goto step5;
 -
 -                              /* Predicted packet is in window by definition.
 -                               * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 -                               * Hence, check seq<=rcv_wup reduces to:
 -                               */
 -                              if (tcp_header_len ==
 -                                  (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
 -                                  tp->rcv_nxt == tp->rcv_wup)
 -                                      tcp_store_ts_recent(tp);
 -
 -                              tcp_rcv_rtt_measure_ts(sk, skb);
 -
 -                              NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
 -
 -                              /* Bulk data transfer: receiver */
 -                              eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
 -                                                    &fragstolen);
 -                      }
 -
 -                      tcp_event_data_recv(sk, skb);
 -
 -                      if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
 -                              /* Well, only one small jumplet in fast path... */
 -                              tcp_ack(sk, skb, FLAG_DATA);
 -                              tcp_data_snd_check(sk);
 -                              if (!inet_csk_ack_scheduled(sk))
 -                                      goto no_ack;
 -                      }
 -
 -                      __tcp_ack_snd_check(sk, 0);
 -no_ack:
 -                      if (eaten)
 -                              kfree_skb_partial(skb, fragstolen);
 -                      sk->sk_data_ready(sk);
 -                      return;
 -              }
 -      }
 -
 -slow_path:
        if (len < (th->doff << 2) || tcp_checksum_complete(skb))
                goto csum_error;
  
        if (!th->ack && !th->rst && !th->syn)
                goto discard;
  
 -      /*
 -       *      Standard slow path.
 -       */
 -
        if (!tcp_validate_incoming(sk, skb, th, 1))
                return;
  
 -step5:
 -      if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
 +      if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0)
                goto discard;
  
        tcp_rcv_rtt_measure_ts(sk, skb);
@@@ -5347,6 -5586,12 +5346,6 @@@ void tcp_finish_connect(struct sock *sk
  
        if (sock_flag(sk, SOCK_KEEPOPEN))
                inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
 -
 -      if (!tp->rx_opt.snd_wscale)
 -              __tcp_fast_path_on(tp, tp->snd_wnd);
 -      else
 -              tp->pred_flags = 0;
 -
  }
  
  static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@@ -5475,7 -5720,7 +5474,7 @@@ static int tcp_rcv_synsent_state_proces
                tcp_ecn_rcv_synack(tp, th);
  
                tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 -              tcp_ack(sk, skb, FLAG_SLOWPATH);
 +              tcp_ack(sk, skb, 0);
  
                /* Ok.. it's good. Set up sequence numbers and
                 * move to established.
@@@ -5711,8 -5956,8 +5710,8 @@@ int tcp_rcv_state_process(struct sock *
                return 0;
  
        /* step 5: check the ACK field */
 -      acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
 -                                    FLAG_UPDATE_TS_RECENT |
 +
 +      acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT |
                                      FLAG_NO_CHALLENGE_ACK) > 0;
  
        if (!acceptable) {
                tp->lsndtime = tcp_jiffies32;
  
                tcp_initialize_rcv_mss(sk);
 -              tcp_fast_path_on(tp);
                break;
  
        case TCP_FIN_WAIT1: {
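
The tcp_input.c hunks above remove VJ header prediction outright, which is
why FLAG_SLOWPATH disappears from every tcp_ack() call: with pred_flags gone
there is no fast path left to distinguish. As a minimal sketch (not part of
the patch; the helper name is invented), the retired fast-path test reduced
to:

	/* Sketch: a segment took the fast path only if its flag word matched
	 * the precomputed pred_flags, it was exactly the next expected
	 * segment, and it did not ACK data that has not been sent yet.
	 */
	static bool tcp_header_predicted(struct tcp_sock *tp,
					 struct tcphdr *th,
					 struct sk_buff *skb)
	{
		return (tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
		       TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
		       !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt);
	}
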
diff --combined net/ipv4/udp.c
index cb633884e8259ad6620686ce773cb8ee6e9266de,cd1d044a7fa580f315af0fd81eb1bf425fd1f38c..25fb14490d6a9d47811f1cef17095b8cb1c27332
@@@ -380,8 -380,8 +380,8 @@@ int udp_v4_get_port(struct sock *sk, un
  
  static int compute_score(struct sock *sk, struct net *net,
                         __be32 saddr, __be16 sport,
 -                       __be32 daddr, unsigned short hnum, int dif,
 -                       bool exact_dif)
 +                       __be32 daddr, unsigned short hnum,
 +                       int dif, int sdif, bool exact_dif)
  {
        int score;
        struct inet_sock *inet;
        }
  
        if (sk->sk_bound_dev_if || exact_dif) {
 -              if (sk->sk_bound_dev_if != dif)
 +              bool dev_match = (sk->sk_bound_dev_if == dif ||
 +                                sk->sk_bound_dev_if == sdif);
 +
 +              if (exact_dif && !dev_match)
                        return -1;
 -              score += 4;
 +              if (sk->sk_bound_dev_if && dev_match)
 +                      score += 4;
        }
 +
        if (sk->sk_incoming_cpu == raw_smp_processor_id())
                score++;
        return score;
@@@ -441,11 -436,10 +441,11 @@@ static u32 udp_ehashfn(const struct ne
  
  /* called with rcu_read_lock() */
  static struct sock *udp4_lib_lookup2(struct net *net,
 -              __be32 saddr, __be16 sport,
 -              __be32 daddr, unsigned int hnum, int dif, bool exact_dif,
 -              struct udp_hslot *hslot2,
 -              struct sk_buff *skb)
 +                                   __be32 saddr, __be16 sport,
 +                                   __be32 daddr, unsigned int hnum,
 +                                   int dif, int sdif, bool exact_dif,
 +                                   struct udp_hslot *hslot2,
 +                                   struct sk_buff *skb)
  {
        struct sock *sk, *result;
        int score, badness, matches = 0, reuseport = 0;
        badness = 0;
        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
                score = compute_score(sk, net, saddr, sport,
 -                                    daddr, hnum, dif, exact_dif);
 +                                    daddr, hnum, dif, sdif, exact_dif);
                if (score > badness) {
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
   * harder than this. -DaveM
   */
  struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 -              __be16 sport, __be32 daddr, __be16 dport,
 -              int dif, struct udp_table *udptable, struct sk_buff *skb)
 +              __be16 sport, __be32 daddr, __be16 dport, int dif,
 +              int sdif, struct udp_table *udptable, struct sk_buff *skb)
  {
        struct sock *sk, *result;
        unsigned short hnum = ntohs(dport);
                        goto begin;
  
                result = udp4_lib_lookup2(net, saddr, sport,
 -                                        daddr, hnum, dif,
 +                                        daddr, hnum, dif, sdif,
                                          exact_dif, hslot2, skb);
                if (!result) {
                        unsigned int old_slot2 = slot2;
                                goto begin;
  
                        result = udp4_lib_lookup2(net, saddr, sport,
 -                                                daddr, hnum, dif,
 +                                                daddr, hnum, dif, sdif,
                                                  exact_dif, hslot2, skb);
                }
                return result;
@@@ -527,7 -521,7 +527,7 @@@ begin
        badness = 0;
        sk_for_each_rcu(sk, &hslot->head) {
                score = compute_score(sk, net, saddr, sport,
 -                                    daddr, hnum, dif, exact_dif);
 +                                    daddr, hnum, dif, sdif, exact_dif);
                if (score > badness) {
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
@@@ -560,7 -554,7 +560,7 @@@ static inline struct sock *__udp4_lib_l
  
        return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
                                 iph->daddr, dport, inet_iif(skb),
 -                               udptable, skb);
 +                               inet_sdif(skb), udptable, skb);
  }
  
  struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
@@@ -582,7 -576,7 +582,7 @@@ struct sock *udp4_lib_lookup(struct ne
        struct sock *sk;
  
        sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
 -                             dif, &udp_table, NULL);
 +                             dif, 0, &udp_table, NULL);
        if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
                sk = NULL;
        return sk;
@@@ -593,7 -587,7 +593,7 @@@ EXPORT_SYMBOL_GPL(udp4_lib_lookup)
  static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
                                       __be16 loc_port, __be32 loc_addr,
                                       __be16 rmt_port, __be32 rmt_addr,
 -                                     int dif, unsigned short hnum)
 +                                     int dif, int sdif, unsigned short hnum)
  {
        struct inet_sock *inet = inet_sk(sk);
  
            (inet->inet_dport != rmt_port && inet->inet_dport) ||
            (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
            ipv6_only_sock(sk) ||
 -          (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
 +          (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
 +           sk->sk_bound_dev_if != sdif))
                return false;
 -      if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
 +      if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
                return false;
        return true;
  }
@@@ -635,8 -628,8 +635,8 @@@ void __udp4_lib_err(struct sk_buff *skb
        struct net *net = dev_net(skb->dev);
  
        sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
 -                      iph->saddr, uh->source, skb->dev->ifindex, udptable,
 -                      NULL);
 +                             iph->saddr, uh->source, skb->dev->ifindex, 0,
 +                             udptable, NULL);
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return; /* No socket for error */
@@@ -1183,11 -1176,7 +1183,11 @@@ static void udp_set_dev_scratch(struct 
        scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
        scratch->is_linear = !skb_is_nonlinear(skb);
  #endif
 -      if (likely(!skb->_skb_refdst))
 +      /* all head states except sp (dst, sk, nf) are always cleared by
 +       * udp_rcv() and we need to preserve secpath, if present, to eventually
 +       * process IP_CMSG_PASSSEC at recvmsg() time
 +       */
 +      if (likely(!skb_sec_path(skb)))
                scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
  }
  
@@@ -1585,7 -1574,8 +1585,8 @@@ int udp_recvmsg(struct sock *sk, struc
                return ip_recv_error(sk, msg, len, addr_len);
  
  try_again:
-       peeking = off = sk_peek_offset(sk, flags);
+       peeking = flags & MSG_PEEK;
+       off = sk_peek_offset(sk, flags);
        skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
        if (!skb)
                return err;
@@@ -1793,6 -1783,13 +1794,6 @@@ static int __udp_queue_rcv_skb(struct s
                sk_mark_napi_id_once(sk, skb);
        }
  
 -      /* At recvmsg() time we may access skb->dst or skb->sp depending on
 -       * the IP options and the cmsg flags, elsewhere can we clear all
 -       * pending head states while they are hot in the cache
 -       */
 -      if (likely(IPCB(skb)->opt.optlen == 0 && !skb_sec_path(skb)))
 -              skb_release_head_state(skb);
 -
        rc = __udp_enqueue_schedule_skb(sk, skb);
        if (rc < 0) {
                int is_udplite = IS_UDPLITE(sk);
@@@ -1960,7 -1957,6 +1961,7 @@@ static int __udp4_lib_mcast_deliver(str
        unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
        unsigned int offset = offsetof(typeof(*sk), sk_node);
        int dif = skb->dev->ifindex;
 +      int sdif = inet_sdif(skb);
        struct hlist_node *node;
        struct sk_buff *nskb;
  
@@@ -1975,7 -1971,7 +1976,7 @@@ start_lookup
  
        sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
                if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
 -                                       uh->source, saddr, dif, hnum))
 +                                       uh->source, saddr, dif, sdif, hnum))
                        continue;
  
                if (!first) {
@@@ -2165,7 -2161,7 +2166,7 @@@ drop
  static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
                                                  __be16 loc_port, __be32 loc_addr,
                                                  __be16 rmt_port, __be32 rmt_addr,
 -                                                int dif)
 +                                                int dif, int sdif)
  {
        struct sock *sk, *result;
        unsigned short hnum = ntohs(loc_port);
        result = NULL;
        sk_for_each_rcu(sk, &hslot->head) {
                if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
 -                                      rmt_port, rmt_addr, dif, hnum)) {
 +                                      rmt_port, rmt_addr, dif, sdif, hnum)) {
                        if (result)
                                return NULL;
                        result = sk;
  static struct sock *__udp4_lib_demux_lookup(struct net *net,
                                            __be16 loc_port, __be32 loc_addr,
                                            __be16 rmt_port, __be32 rmt_addr,
 -                                          int dif)
 +                                          int dif, int sdif)
  {
        unsigned short hnum = ntohs(loc_port);
        unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
  
        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
                if (INET_MATCH(sk, net, acookie, rmt_addr,
 -                             loc_addr, ports, dif))
 +                             loc_addr, ports, dif, sdif))
                        return sk;
                /* Only check first socket in chain */
                break;
@@@ -2224,7 -2220,6 +2225,7 @@@ void udp_v4_early_demux(struct sk_buff 
        struct sock *sk = NULL;
        struct dst_entry *dst;
        int dif = skb->dev->ifindex;
 +      int sdif = inet_sdif(skb);
        int ours;
  
        /* validate the packet */
                }
  
                sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
 -                                                 uh->source, iph->saddr, dif);
 +                                                 uh->source, iph->saddr,
 +                                                 dif, sdif);
        } else if (skb->pkt_type == PACKET_HOST) {
                sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
 -                                           uh->source, iph->saddr, dif);
 +                                           uh->source, iph->saddr, dif, sdif);
        }
  
        if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
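
The IPv4 UDP lookup above now threads sdif, the l3mdev (VRF) source device
index, next to dif; the same change lands in net/ipv6/udp.c below. The
bound-device rule the reworked compute_score() applies can be read as this
helper, a sketch for illustration only (the name is not from this patch):

	/* A bound socket matches if it is bound to either the receiving
	 * device (dif) or the enslaving VRF device (sdif). With exact_dif
	 * a mismatch disqualifies the socket; otherwise a match on a bound
	 * device only adds to the score.
	 */
	static bool udp_bound_dev_match(int bound_dev_if, int dif, int sdif)
	{
		return bound_dev_if == dif || bound_dev_if == sdif;
	}
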
diff --combined net/ipv6/ip6_fib.c
index 8c58c7558de003bb47e9c50c7f4fcfc0b7251bd5,5cc0ea0381981b0539d5d6e67401d962a6f6a230..549aacc3cb2c6f803a19d97e295ceac56ce6ef44
@@@ -33,7 -33,6 +33,7 @@@
  #include <net/ndisc.h>
  #include <net/addrconf.h>
  #include <net/lwtunnel.h>
 +#include <net/fib_notifier.h>
  
  #include <net/ip6_fib.h>
  #include <net/ip6_route.h>
@@@ -154,7 -153,7 +154,7 @@@ static void node_free(struct fib6_node 
        kmem_cache_free(fib6_node_kmem, fn);
  }
  
 -static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
 +void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
  {
        int cpu;
  
        free_percpu(non_pcpu_rt->rt6i_pcpu);
        non_pcpu_rt->rt6i_pcpu = NULL;
  }
 -
 -static void rt6_release(struct rt6_info *rt)
 -{
 -      if (atomic_dec_and_test(&rt->rt6i_ref)) {
 -              rt6_free_pcpu(rt);
 -              dst_dev_put(&rt->dst);
 -              dst_release(&rt->dst);
 -      }
 -}
 +EXPORT_SYMBOL_GPL(rt6_free_pcpu);
  
  static void fib6_link_table(struct net *net, struct fib6_table *tb)
  {
@@@ -295,109 -302,6 +295,109 @@@ static void __net_init fib6_tables_init
  
  #endif
  
 +unsigned int fib6_tables_seq_read(struct net *net)
 +{
 +      unsigned int h, fib_seq = 0;
 +
 +      rcu_read_lock();
 +      for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
 +              struct hlist_head *head = &net->ipv6.fib_table_hash[h];
 +              struct fib6_table *tb;
 +
 +              hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
 +                      read_lock_bh(&tb->tb6_lock);
 +                      fib_seq += tb->fib_seq;
 +                      read_unlock_bh(&tb->tb6_lock);
 +              }
 +      }
 +      rcu_read_unlock();
 +
 +      return fib_seq;
 +}
 +
 +static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
 +                                  enum fib_event_type event_type,
 +                                  struct rt6_info *rt)
 +{
 +      struct fib6_entry_notifier_info info = {
 +              .rt = rt,
 +      };
 +
 +      return call_fib6_notifier(nb, net, event_type, &info.info);
 +}
 +
 +static int call_fib6_entry_notifiers(struct net *net,
 +                                   enum fib_event_type event_type,
 +                                   struct rt6_info *rt)
 +{
 +      struct fib6_entry_notifier_info info = {
 +              .rt = rt,
 +      };
 +
 +      rt->rt6i_table->fib_seq++;
 +      return call_fib6_notifiers(net, event_type, &info.info);
 +}
 +
 +struct fib6_dump_arg {
 +      struct net *net;
 +      struct notifier_block *nb;
 +};
 +
 +static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg)
 +{
 +      if (rt == arg->net->ipv6.ip6_null_entry)
 +              return;
 +      call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
 +}
 +
 +static int fib6_node_dump(struct fib6_walker *w)
 +{
 +      struct rt6_info *rt;
 +
 +      for (rt = w->leaf; rt; rt = rt->dst.rt6_next)
 +              fib6_rt_dump(rt, w->args);
 +      w->leaf = NULL;
 +      return 0;
 +}
 +
 +static void fib6_table_dump(struct net *net, struct fib6_table *tb,
 +                          struct fib6_walker *w)
 +{
 +      w->root = &tb->tb6_root;
 +      read_lock_bh(&tb->tb6_lock);
 +      fib6_walk(net, w);
 +      read_unlock_bh(&tb->tb6_lock);
 +}
 +
 +/* Called with rcu_read_lock() */
 +int fib6_tables_dump(struct net *net, struct notifier_block *nb)
 +{
 +      struct fib6_dump_arg arg;
 +      struct fib6_walker *w;
 +      unsigned int h;
 +
 +      w = kzalloc(sizeof(*w), GFP_ATOMIC);
 +      if (!w)
 +              return -ENOMEM;
 +
 +      w->func = fib6_node_dump;
 +      arg.net = net;
 +      arg.nb = nb;
 +      w->args = &arg;
 +
 +      for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
 +              struct hlist_head *head = &net->ipv6.fib_table_hash[h];
 +              struct fib6_table *tb;
 +
 +              hlist_for_each_entry_rcu(tb, head, tb6_hlist)
 +                      fib6_table_dump(net, tb, w);
 +      }
 +
 +      kfree(w);
 +
 +      return 0;
 +}
 +
  static int fib6_dump_node(struct fib6_walker *w)
  {
        int res;
@@@ -829,6 -733,8 +829,6 @@@ static void fib6_purge_rt(struct rt6_in
                        }
                        fn = fn->parent;
                }
 -              /* No more references are possible at this point. */
 -              BUG_ON(atomic_read(&rt->rt6i_ref) != 1);
        }
  }
  
@@@ -973,8 -879,6 +973,8 @@@ add
                *ins = rt;
                rt->rt6i_node = fn;
                atomic_inc(&rt->rt6i_ref);
 +              call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD,
 +                                        rt);
                if (!info->skip_notify)
                        inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
                info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
                rt->rt6i_node = fn;
                rt->dst.rt6_next = iter->dst.rt6_next;
                atomic_inc(&rt->rt6i_ref);
 +              call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
 +                                        rt);
                if (!info->skip_notify)
                        inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
                if (!(fn->fn_flags & RTN_RTINFO)) {
                        fn->fn_flags |= RTN_RTINFO;
                }
                nsiblings = iter->rt6i_nsiblings;
 +              iter->rt6i_node = NULL;
                fib6_purge_rt(iter, fn, info->nl_net);
+               if (fn->rr_ptr == iter)
+                       fn->rr_ptr = NULL;
                rt6_release(iter);
  
                if (nsiblings) {
                                        break;
                                if (rt6_qualify_for_ecmp(iter)) {
                                        *ins = iter->dst.rt6_next;
 +                                      iter->rt6i_node = NULL;
                                        fib6_purge_rt(iter, fn, info->nl_net);
+                                       if (fn->rr_ptr == iter)
+                                               fn->rr_ptr = NULL;
                                        rt6_release(iter);
                                        nsiblings--;
                                } else {
@@@ -1114,7 -1018,7 +1118,7 @@@ int fib6_add(struct fib6_node *root, st
                        /* Create subtree root node */
                        sfn = node_alloc();
                        if (!sfn)
-                               goto st_failure;
+                               goto failure;
  
                        sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
                        atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
  
                        if (IS_ERR(sn)) {
                                /* If it is failed, discard just allocated
-                                  root, and then (in st_failure) stale node
+                                  root, and then (in failure) stale node
                                   in main tree.
                                 */
                                node_free(sfn);
                                err = PTR_ERR(sn);
-                               goto st_failure;
+                               goto failure;
                        }
  
                        /* Now link new subtree to main tree */
  
                        if (IS_ERR(sn)) {
                                err = PTR_ERR(sn);
-                               goto st_failure;
+                               goto failure;
                        }
                }
  
                        atomic_inc(&pn->leaf->rt6i_ref);
                }
  #endif
-               /* Always release dst as dst->__refcnt is guaranteed
-                * to be taken before entering this function
-                */
-               dst_release_immediate(&rt->dst);
+               goto failure;
        }
        return err;
  
- #ifdef CONFIG_IPV6_SUBTREES
-       /* Subtree creation failed, probably main tree node
-          is orphan. If it is, shoot it.
+ failure:
+       /* fn->leaf could be NULL if fn is an intermediate node and we
+        * failed to add the new route to it, in both the subtree-creation
+        * and the fib6_add_rt2node() failure cases.
+        * In both cases, fib6_repair_tree() should be called to fix
+        * fn->leaf.
         */
- st_failure:
        if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
                fib6_repair_tree(info->nl_net, fn);
        /* Always release dst as dst->__refcnt is guaranteed
         */
        dst_release_immediate(&rt->dst);
        return err;
- #endif
  }
  
  /*
@@@ -1559,7 -1461,6 +1561,7 @@@ static void fib6_del_route(struct fib6_
  
        fib6_purge_rt(rt, fn, net);
  
 +      call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt);
        if (!info->skip_notify)
                inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
        rt6_release(rt);
@@@ -1940,11 -1841,6 +1942,11 @@@ static void fib6_gc_timer_cb(unsigned l
  static int __net_init fib6_net_init(struct net *net)
  {
        size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
 +      int err;
 +
 +      err = fib6_notifier_init(net);
 +      if (err)
 +              return err;
  
        spin_lock_init(&net->ipv6.fib6_gc_lock);
        rwlock_init(&net->ipv6.fib6_walker_lock);
@@@ -1997,7 -1893,6 +1999,7 @@@ out_fib_table_hash
  out_rt6_stats:
        kfree(net->ipv6.rt6_stats);
  out_timer:
 +      fib6_notifier_exit(net);
        return -ENOMEM;
  }
  
@@@ -2014,7 -1909,6 +2016,7 @@@ static void fib6_net_exit(struct net *n
        kfree(net->ipv6.fib6_main_tbl);
        kfree(net->ipv6.fib_table_hash);
        kfree(net->ipv6.rt6_stats);
 +      fib6_notifier_exit(net);
  }
  
  static struct pernet_operations fib6_net_ops = {
@@@ -2038,7 -1932,7 +2040,7 @@@ int __init fib6_init(void
                goto out_kmem_cache_create;
  
        ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
 -                            NULL);
 +                            0);
        if (ret)
                goto out_unregister_subsys;
  
diff --combined net/ipv6/udp.c
index 19afcaf4a22e036a8768fd1c4b4f870c8fa47d37,20039c8501eb9729619f4337a2757a2954163614..2a15f1bb6ef8643a07997b89f71f034231ea653c
@@@ -129,7 -129,7 +129,7 @@@ static void udp_v6_rehash(struct sock *
  static int compute_score(struct sock *sk, struct net *net,
                         const struct in6_addr *saddr, __be16 sport,
                         const struct in6_addr *daddr, unsigned short hnum,
 -                       int dif, bool exact_dif)
 +                       int dif, int sdif, bool exact_dif)
  {
        int score;
        struct inet_sock *inet;
        }
  
        if (sk->sk_bound_dev_if || exact_dif) {
 -              if (sk->sk_bound_dev_if != dif)
 +              bool dev_match = (sk->sk_bound_dev_if == dif ||
 +                                sk->sk_bound_dev_if == sdif);
 +
 +              if (exact_dif && !dev_match)
                        return -1;
 -              score++;
 +              if (sk->sk_bound_dev_if && dev_match)
 +                      score++;
        }
  
        if (sk->sk_incoming_cpu == raw_smp_processor_id())
  /* called with rcu_read_lock() */
  static struct sock *udp6_lib_lookup2(struct net *net,
                const struct in6_addr *saddr, __be16 sport,
 -              const struct in6_addr *daddr, unsigned int hnum, int dif,
 -              bool exact_dif, struct udp_hslot *hslot2,
 -              struct sk_buff *skb)
 +              const struct in6_addr *daddr, unsigned int hnum,
 +              int dif, int sdif, bool exact_dif,
 +              struct udp_hslot *hslot2, struct sk_buff *skb)
  {
        struct sock *sk, *result;
        int score, badness, matches = 0, reuseport = 0;
        badness = -1;
        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
                score = compute_score(sk, net, saddr, sport,
 -                                    daddr, hnum, dif, exact_dif);
 +                                    daddr, hnum, dif, sdif, exact_dif);
                if (score > badness) {
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
  
  /* rcu_read_lock() must be held */
  struct sock *__udp6_lib_lookup(struct net *net,
 -                                    const struct in6_addr *saddr, __be16 sport,
 -                                    const struct in6_addr *daddr, __be16 dport,
 -                                    int dif, struct udp_table *udptable,
 -                                    struct sk_buff *skb)
 +                             const struct in6_addr *saddr, __be16 sport,
 +                             const struct in6_addr *daddr, __be16 dport,
 +                             int dif, int sdif, struct udp_table *udptable,
 +                             struct sk_buff *skb)
  {
        struct sock *sk, *result;
        unsigned short hnum = ntohs(dport);
                        goto begin;
  
                result = udp6_lib_lookup2(net, saddr, sport,
 -                                        daddr, hnum, dif, exact_dif,
 +                                        daddr, hnum, dif, sdif, exact_dif,
                                          hslot2, skb);
                if (!result) {
                        unsigned int old_slot2 = slot2;
                                goto begin;
  
                        result = udp6_lib_lookup2(net, saddr, sport,
 -                                                daddr, hnum, dif,
 +                                                daddr, hnum, dif, sdif,
                                                  exact_dif, hslot2,
                                                  skb);
                }
@@@ -265,7 -261,7 +265,7 @@@ begin
        badness = -1;
        sk_for_each_rcu(sk, &hslot->head) {
                score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
 -                                    exact_dif);
 +                                    sdif, exact_dif);
                if (score > badness) {
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
@@@ -298,7 -294,7 +298,7 @@@ static struct sock *__udp6_lib_lookup_s
  
        return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
                                 &iph->daddr, dport, inet6_iif(skb),
 -                               udptable, skb);
 +                               inet6_sdif(skb), udptable, skb);
  }
  
  struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
  
        return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
                                 &iph->daddr, dport, inet6_iif(skb),
 -                               &udp_table, skb);
 +                               inet6_sdif(skb), &udp_table, skb);
  }
  EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
  
@@@ -324,7 -320,7 +324,7 @@@ struct sock *udp6_lib_lookup(struct ne
        struct sock *sk;
  
        sk =  __udp6_lib_lookup(net, saddr, sport, daddr, dport,
 -                              dif, &udp_table, NULL);
 +                              dif, 0, &udp_table, NULL);
        if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
                sk = NULL;
        return sk;
@@@ -366,7 -362,8 +366,8 @@@ int udpv6_recvmsg(struct sock *sk, stru
                return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
  
  try_again:
-       peeking = off = sk_peek_offset(sk, flags);
+       peeking = flags & MSG_PEEK;
+       off = sk_peek_offset(sk, flags);
        skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
        if (!skb)
                return err;
@@@ -505,7 -502,7 +506,7 @@@ void __udp6_lib_err(struct sk_buff *skb
        struct net *net = dev_net(skb->dev);
  
        sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
 -                             inet6_iif(skb), udptable, skb);
 +                             inet6_iif(skb), 0, udptable, skb);
        if (!sk) {
                __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
                                  ICMP6_MIB_INERRORS);
@@@ -897,7 -894,7 +898,7 @@@ discard
  static struct sock *__udp6_lib_demux_lookup(struct net *net,
                        __be16 loc_port, const struct in6_addr *loc_addr,
                        __be16 rmt_port, const struct in6_addr *rmt_addr,
 -                      int dif)
 +                      int dif, int sdif)
  {
        unsigned short hnum = ntohs(loc_port);
        unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum);
  
        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
                if (sk->sk_state == TCP_ESTABLISHED &&
 -                  INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif))
 +                  INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif))
                        return sk;
                /* Only check first socket in chain */
                break;
@@@ -923,7 -920,6 +924,7 @@@ static void udp_v6_early_demux(struct s
        struct sock *sk;
        struct dst_entry *dst;
        int dif = skb->dev->ifindex;
 +      int sdif = inet6_sdif(skb);
  
        if (!pskb_may_pull(skb, skb_transport_offset(skb) +
            sizeof(struct udphdr)))
                sk = __udp6_lib_demux_lookup(net, uh->dest,
                                             &ipv6_hdr(skb)->daddr,
                                             uh->source, &ipv6_hdr(skb)->saddr,
 -                                           dif);
 +                                           dif, sdif);
        else
                return;
  
diff --combined net/openvswitch/datapath.c
index f6e229b51dfb39acdb0e8458062fd50db58eb9f2,6b44fe4052825a87b373bafb58ca014e3ec99015..76cf273a56c791bbc84262811c2f35e94e024dcf
@@@ -335,6 -335,8 +335,6 @@@ static int queue_gso_packets(struct dat
                             const struct dp_upcall_info *upcall_info,
                                 uint32_t cutlen)
  {
 -      unsigned short gso_type = skb_shinfo(skb)->gso_type;
 -      struct sw_flow_key later_key;
        struct sk_buff *segs, *nskb;
        int err;
  
        if (segs == NULL)
                return -EINVAL;
  
 -      if (gso_type & SKB_GSO_UDP) {
 -              /* The initial flow key extracted by ovs_flow_key_extract()
 -               * in this case is for a first fragment, so we need to
 -               * properly mark later fragments.
 -               */
 -              later_key = *key;
 -              later_key.ip.frag = OVS_FRAG_TYPE_LATER;
 -      }
 -
        /* Queue all of the segments. */
        skb = segs;
        do {
 -              if (gso_type & SKB_GSO_UDP && skb != segs)
 -                      key = &later_key;
 -
                err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
                if (err)
                        break;
  }
  
  static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
-                             unsigned int hdrlen)
+                             unsigned int hdrlen, int actions_attrlen)
  {
        size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
                + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
  
        /* OVS_PACKET_ATTR_ACTIONS */
        if (upcall_info->actions_len)
-               size += nla_total_size(upcall_info->actions_len);
+               size += nla_total_size(actions_attrlen);
  
        /* OVS_PACKET_ATTR_MRU */
        if (upcall_info->mru)
@@@ -451,7 -465,8 +451,8 @@@ static int queue_userspace_packet(struc
        else
                hlen = skb->len;
  
-       len = upcall_msg_size(upcall_info, hlen - cutlen);
+       len = upcall_msg_size(upcall_info, hlen - cutlen,
+                             OVS_CB(skb)->acts_origlen);
        user_skb = genlmsg_new(len, GFP_ATOMIC);
        if (!user_skb) {
                err = -ENOMEM;
diff --combined net/sched/cls_api.c
index ebeeb87e6d44da32b998d517f17586ced8790005,9fd44c22134783edf3db4f62ff1e8184c455cbd7..eef6b077f30ed69d2dc04bea61a851dc449cafee
@@@ -100,6 -100,21 +100,6 @@@ int unregister_tcf_proto_ops(struct tcf
  }
  EXPORT_SYMBOL(unregister_tcf_proto_ops);
  
 -static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 -                        struct nlmsghdr *n, struct tcf_proto *tp,
 -                        unsigned long fh, int event, bool unicast);
 -
 -static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
 -                               struct nlmsghdr *n,
 -                               struct tcf_chain *chain, int event)
 -{
 -      struct tcf_proto *tp;
 -
 -      for (tp = rtnl_dereference(chain->filter_chain);
 -           tp; tp = rtnl_dereference(tp->next))
 -              tfilter_notify(net, oskb, n, tp, 0, event, false);
 -}
 -
  /* Select new prio value from the range, managed by kernel. */
  
  static inline u32 tcf_auto_prio(struct tcf_proto *tp)
@@@ -190,7 -205,7 +190,7 @@@ static void tcf_chain_flush(struct tcf_
  {
        struct tcf_proto *tp;
  
-       if (*chain->p_filter_chain)
+       if (chain->p_filter_chain)
                RCU_INIT_POINTER(*chain->p_filter_chain, NULL);
        while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) {
                RCU_INIT_POINTER(chain->filter_chain, tp->next);
@@@ -392,109 -407,6 +392,109 @@@ static struct tcf_proto *tcf_chain_tp_f
        return tp;
  }
  
 +static int tcf_fill_node(struct net *net, struct sk_buff *skb,
 +                       struct tcf_proto *tp, void *fh, u32 portid,
 +                       u32 seq, u16 flags, int event)
 +{
 +      struct tcmsg *tcm;
 +      struct nlmsghdr  *nlh;
 +      unsigned char *b = skb_tail_pointer(skb);
 +
 +      nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
 +      if (!nlh)
 +              goto out_nlmsg_trim;
 +      tcm = nlmsg_data(nlh);
 +      tcm->tcm_family = AF_UNSPEC;
 +      tcm->tcm__pad1 = 0;
 +      tcm->tcm__pad2 = 0;
 +      tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
 +      tcm->tcm_parent = tp->classid;
 +      tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
 +      if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
 +              goto nla_put_failure;
 +      if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index))
 +              goto nla_put_failure;
 +      if (!fh) {
 +              tcm->tcm_handle = 0;
 +      } else {
 +              if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
 +                      goto nla_put_failure;
 +      }
 +      nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 +      return skb->len;
 +
 +out_nlmsg_trim:
 +nla_put_failure:
 +      nlmsg_trim(skb, b);
 +      return -1;
 +}
 +
 +static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 +                        struct nlmsghdr *n, struct tcf_proto *tp,
 +                        void *fh, int event, bool unicast)
 +{
 +      struct sk_buff *skb;
 +      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 +
 +      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 +      if (!skb)
 +              return -ENOBUFS;
 +
 +      if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
 +                        n->nlmsg_flags, event) <= 0) {
 +              kfree_skb(skb);
 +              return -EINVAL;
 +      }
 +
 +      if (unicast)
 +              return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
 +
 +      return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 +                            n->nlmsg_flags & NLM_F_ECHO);
 +}
 +
 +static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 +                            struct nlmsghdr *n, struct tcf_proto *tp,
 +                            void *fh, bool unicast, bool *last)
 +{
 +      struct sk_buff *skb;
 +      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 +      int err;
 +
 +      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 +      if (!skb)
 +              return -ENOBUFS;
 +
 +      if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
 +                        n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
 +              kfree_skb(skb);
 +              return -EINVAL;
 +      }
 +
 +      err = tp->ops->delete(tp, fh, last);
 +      if (err) {
 +              kfree_skb(skb);
 +              return err;
 +      }
 +
 +      if (unicast)
 +              return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
 +
 +      return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 +                            n->nlmsg_flags & NLM_F_ECHO);
 +}
 +
 +static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
 +                               struct nlmsghdr *n,
 +                               struct tcf_chain *chain, int event)
 +{
 +      struct tcf_proto *tp;
 +
 +      for (tp = rtnl_dereference(chain->filter_chain);
 +           tp; tp = rtnl_dereference(tp->next))
 +              tfilter_notify(net, oskb, n, tp, 0, event, false);
 +}
 +
  /* Add/change/delete/get a filter node */
  
  static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
        struct tcf_proto *tp;
        const struct Qdisc_class_ops *cops;
        unsigned long cl;
 -      unsigned long fh;
 +      void *fh;
        int err;
        int tp_created;
  
@@@ -655,7 -567,7 +655,7 @@@ replay
  
        fh = tp->ops->get(tp, t->tcm_handle);
  
 -      if (fh == 0) {
 +      if (!fh) {
                if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
                        tcf_chain_tp_remove(chain, &chain_info, tp);
                        tfilter_notify(net, skb, n, tp, fh,
                        }
                        break;
                case RTM_DELTFILTER:
 -                      err = tp->ops->delete(tp, fh, &last);
 +                      err = tfilter_del_notify(net, skb, n, tp, fh, false,
 +                                               &last);
                        if (err)
                                goto errout;
 -                      tfilter_notify(net, skb, n, tp, t->tcm_handle,
 -                                     RTM_DELTFILTER, false);
                        if (last) {
                                tcf_chain_tp_remove(chain, &chain_info, tp);
                                tcf_proto_destroy(tp);
@@@ -724,13 -637,75 +724,13 @@@ errout
        return err;
  }
  
 -static int tcf_fill_node(struct net *net, struct sk_buff *skb,
 -                       struct tcf_proto *tp, unsigned long fh, u32 portid,
 -                       u32 seq, u16 flags, int event)
 -{
 -      struct tcmsg *tcm;
 -      struct nlmsghdr  *nlh;
 -      unsigned char *b = skb_tail_pointer(skb);
 -
 -      nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
 -      if (!nlh)
 -              goto out_nlmsg_trim;
 -      tcm = nlmsg_data(nlh);
 -      tcm->tcm_family = AF_UNSPEC;
 -      tcm->tcm__pad1 = 0;
 -      tcm->tcm__pad2 = 0;
 -      tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
 -      tcm->tcm_parent = tp->classid;
 -      tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
 -      if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
 -              goto nla_put_failure;
 -      if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index))
 -              goto nla_put_failure;
 -      tcm->tcm_handle = fh;
 -      if (RTM_DELTFILTER != event) {
 -              tcm->tcm_handle = 0;
 -              if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
 -                      goto nla_put_failure;
 -      }
 -      nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 -      return skb->len;
 -
 -out_nlmsg_trim:
 -nla_put_failure:
 -      nlmsg_trim(skb, b);
 -      return -1;
 -}
 -
 -static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 -                        struct nlmsghdr *n, struct tcf_proto *tp,
 -                        unsigned long fh, int event, bool unicast)
 -{
 -      struct sk_buff *skb;
 -      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 -
 -      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 -      if (!skb)
 -              return -ENOBUFS;
 -
 -      if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
 -                        n->nlmsg_flags, event) <= 0) {
 -              kfree_skb(skb);
 -              return -EINVAL;
 -      }
 -
 -      if (unicast)
 -              return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
 -
 -      return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 -                            n->nlmsg_flags & NLM_F_ECHO);
 -}
 -
  struct tcf_dump_args {
        struct tcf_walker w;
        struct sk_buff *skb;
        struct netlink_callback *cb;
  };
  
 -static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
 -                       struct tcf_walker *arg)
 +static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
  {
        struct tcf_dump_args *a = (void *)arg;
        struct net *net = sock_net(a->skb->sk);
@@@ -908,12 -883,18 +908,12 @@@ int tcf_exts_validate(struct net *net, 
  }
  EXPORT_SYMBOL(tcf_exts_validate);
  
 -void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
 -                   struct tcf_exts *src)
 +void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src)
  {
  #ifdef CONFIG_NET_CLS_ACT
        struct tcf_exts old = *dst;
  
 -      tcf_tree_lock(tp);
 -      dst->nr_actions = src->nr_actions;
 -      dst->actions = src->actions;
 -      dst->type = src->type;
 -      tcf_tree_unlock(tp);
 -
 +      *dst = *src;
        tcf_exts_destroy(&old);
  #endif
  }
@@@ -934,7 -915,7 +934,7 @@@ int tcf_exts_dump(struct sk_buff *skb, 
  #ifdef CONFIG_NET_CLS_ACT
        struct nlattr *nest;
  
 -      if (exts->action && exts->nr_actions) {
 +      if (exts->action && tcf_exts_has_actions(exts)) {
                /*
                 * again for backward compatible mode - we want
                 * to work with both old and new modes of entering
@@@ -991,7 -972,7 +991,7 @@@ int tcf_exts_get_dev(struct net_device 
        const struct tc_action *a;
        LIST_HEAD(actions);
  
 -      if (tc_no_actions(exts))
 +      if (!tcf_exts_has_actions(exts))
                return -EINVAL;
  
        tcf_exts_to_list(exts, &actions);
@@@ -1010,10 -991,10 +1010,10 @@@ EXPORT_SYMBOL(tcf_exts_get_dev)
  
  static int __init tc_filter_init(void)
  {
 -      rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
 -      rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);
 +      rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0);
 +      rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
 -                    tc_dump_tfilter, NULL);
 +                    tc_dump_tfilter, 0);
  
        return 0;
  }
diff --combined net/sctp/ipv6.c
index a2a1c1d08d512d3515a5bc2ef2c9406cda385489,a4b6ffb6149541b78e39aceae859224d13487106..51c4887695909d171285b98ce1be779a3adedbab
@@@ -243,8 -243,8 +243,8 @@@ static void sctp_v6_get_dst(struct sctp
        union sctp_addr *daddr = &t->ipaddr;
        union sctp_addr dst_saddr;
        struct in6_addr *final_p, final;
 +      enum sctp_scope scope;
        __u8 matchlen = 0;
 -      sctp_scope_t scope;
  
        memset(fl6, 0, sizeof(struct flowi6));
        fl6->daddr = daddr->v6.sin6_addr;
@@@ -497,7 -497,7 +497,7 @@@ static void sctp_v6_from_addr_param(uni
  static int sctp_v6_to_addr_param(const union sctp_addr *addr,
                                 union sctp_addr_param *param)
  {
 -      int length = sizeof(sctp_ipv6addr_param_t);
 +      int length = sizeof(struct sctp_ipv6addr_param);
  
        param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS;
        param->v6.param_hdr.length = htons(length);
@@@ -512,7 -512,9 +512,9 @@@ static void sctp_v6_to_addr(union sctp_
  {
        addr->sa.sa_family = AF_INET6;
        addr->v6.sin6_port = port;
+       addr->v6.sin6_flowinfo = 0;
        addr->v6.sin6_addr = *saddr;
+       addr->v6.sin6_scope_id = 0;
  }
  
  /* Compare addresses exactly.
@@@ -624,10 -626,10 +626,10 @@@ static int sctp_v6_addr_valid(union sct
  }
  
  /* What is the scope of 'addr'?  */
 -static sctp_scope_t sctp_v6_scope(union sctp_addr *addr)
 +static enum sctp_scope sctp_v6_scope(union sctp_addr *addr)
  {
 +      enum sctp_scope retval;
        int v6scope;
 -      sctp_scope_t retval;
  
        /* The IPv6 scope is really a set of bit fields.
         * See IFA_* in <net/if_inet6.h>.  Map to a generic SCTP scope.
diff --combined net/unix/af_unix.c
index 5c53f22d62e8d6ef79eca921ae5a361443d73385,be8982b4f8c00be8bb95748c1c33a76e13079dff..7f46bab4ce5c84aa285d8141b4e0f822e8dab01f
@@@ -1528,13 -1528,26 +1528,13 @@@ static inline bool too_many_unix_fds(st
        return false;
  }
  
 -#define MAX_RECURSION_LEVEL 4
 -
  static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
  {
        int i;
 -      unsigned char max_level = 0;
  
        if (too_many_unix_fds(current))
                return -ETOOMANYREFS;
  
 -      for (i = scm->fp->count - 1; i >= 0; i--) {
 -              struct sock *sk = unix_get_socket(scm->fp->fp[i]);
 -
 -              if (sk)
 -                      max_level = max(max_level,
 -                                      unix_sk(sk)->recursion_level);
 -      }
 -      if (unlikely(max_level > MAX_RECURSION_LEVEL))
 -              return -ETOOMANYREFS;
 -
        /*
         * Need to duplicate file references for the sake of garbage
         * collection.  Otherwise a socket in the fps might become a
  
        for (i = scm->fp->count - 1; i >= 0; i--)
                unix_inflight(scm->fp->user, scm->fp->fp[i]);
 -      return max_level;
 +      return 0;
  }
  
  static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
@@@ -1636,6 -1649,7 +1636,6 @@@ static int unix_dgram_sendmsg(struct so
        struct sk_buff *skb;
        long timeo;
        struct scm_cookie scm;
 -      int max_level;
        int data_len = 0;
        int sk_locked;
  
        err = unix_scm_to_skb(&scm, skb, true);
        if (err < 0)
                goto out_free;
 -      max_level = err + 1;
  
        skb_put(skb, len - data_len);
        skb->data_len = data_len;
@@@ -1804,6 -1819,8 +1804,6 @@@ restart_locked
                __net_timestamp(skb);
        maybe_add_creds(skb, sock, other);
        skb_queue_tail(&other->sk_receive_queue, skb);
 -      if (max_level > unix_sk(other)->recursion_level)
 -              unix_sk(other)->recursion_level = max_level;
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
@@@ -1838,6 -1855,7 +1838,6 @@@ static int unix_stream_sendmsg(struct s
        int sent = 0;
        struct scm_cookie scm;
        bool fds_sent = false;
 -      int max_level;
        int data_len;
  
        wait_for_unix_gc();
                        kfree_skb(skb);
                        goto out_err;
                }
 -              max_level = err + 1;
                fds_sent = true;
  
                skb_put(skb, size - data_len);
  
                maybe_add_creds(skb, sock, other);
                skb_queue_tail(&other->sk_receive_queue, skb);
 -              if (max_level > unix_sk(other)->recursion_level)
 -                      unix_sk(other)->recursion_level = max_level;
                unix_state_unlock(other);
                other->sk_data_ready(other);
                sent += size;
@@@ -2283,10 -2304,7 +2283,7 @@@ static int unix_stream_read_generic(str
         */
        mutex_lock(&u->iolock);
  
-       if (flags & MSG_PEEK)
-               skip = sk_peek_offset(sk, flags);
-       else
-               skip = 0;
+       skip = max(sk_peek_offset(sk, flags), 0);
  
        do {
                int chunk;
@@@ -2303,6 -2321,7 +2300,6 @@@ redo
                last_len = last ? last->len : 0;
  again:
                if (skb == NULL) {
 -                      unix_sk(sk)->recursion_level = 0;
                        if (copied >= target)
                                goto unlock;
  
diff --combined tools/lib/bpf/libbpf.c
index 1cc3ea0ffdc3b38bf56b8c531374859501febd83,8c67a90dbd8229062fe1c0a9ae317fd5f3c1689b..35f6dfcdc56518528b964f04d7bc7033e1a2b36b
@@@ -879,7 -879,8 +879,8 @@@ bpf_object__create_maps(struct bpf_obje
                        size_t j;
                        int err = *pfd;
  
-                       pr_warning("failed to create map: %s\n",
+                       pr_warning("failed to create map (name: '%s'): %s\n",
+                                  obj->maps[i].name,
                                   strerror(errno));
                        for (j = 0; j < i; j++)
                                zclose(obj->maps[j].fd);
@@@ -1744,32 -1745,3 +1745,32 @@@ long libbpf_get_error(const void *ptr
                return PTR_ERR(ptr);
        return 0;
  }
 +
 +int bpf_prog_load(const char *file, enum bpf_prog_type type,
 +                struct bpf_object **pobj, int *prog_fd)
 +{
 +      struct bpf_program *prog;
 +      struct bpf_object *obj;
 +      int err;
 +
 +      obj = bpf_object__open(file);
 +      if (IS_ERR(obj))
 +              return -ENOENT;
 +
 +      prog = bpf_program__next(NULL, obj);
 +      if (!prog) {
 +              bpf_object__close(obj);
 +              return -ENOENT;
 +      }
 +
 +      bpf_program__set_type(prog, type);
 +      err = bpf_object__load(obj);
 +      if (err) {
 +              bpf_object__close(obj);
 +              return -EINVAL;
 +      }
 +
 +      *pobj = obj;
 +      *prog_fd = bpf_program__fd(prog);
 +      return 0;
 +}
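
The new bpf_prog_load() wrapper opens an ELF object, forces the type of its
first program, loads the object, and hands back both the object and the
program fd. A hypothetical caller, with the file name and program type as
placeholder values:

	/* Example values only: "prog.o" and BPF_PROG_TYPE_XDP stand in for
	 * whatever the caller actually wants to load.
	 */
	static int load_example_prog(void)
	{
		struct bpf_object *obj;
		int prog_fd;

		if (bpf_prog_load("prog.o", BPF_PROG_TYPE_XDP, &obj, &prog_fd))
			return -1;

		/* attach or test via prog_fd, then release the object */
		bpf_object__close(obj);
		return 0;
	}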