Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
author	Jakub Kicinski <kuba@kernel.org>
	Thu, 15 Feb 2024 22:01:43 +0000 (14:01 -0800)
committer	Jakub Kicinski <kuba@kernel.org>
	Fri, 16 Feb 2024 00:20:04 +0000 (16:20 -0800)
Cross-merge networking fixes after downstream PR.

No conflicts.

Adjacent changes:

net/core/dev.c
  9f30831390ed ("net: add rcu safety to rtnl_prop_list_size()")
  723de3ebef03 ("net: free altname using an RCU callback")

net/unix/garbage.c
  11498715f266 ("af_unix: Remove io_uring code for GC.")
  25236c91b5ab ("af_unix: Fix task hung while purging oob_skb in GC.")

drivers/net/ethernet/renesas/ravb_main.c
  ed4adc07207d ("net: ravb: Count packets instead of descriptors in GbEth RX path")
  c2da9408579d ("ravb: Add Rx checksum offload support for GbEth")

net/mptcp/protocol.c
  bdd70eb68913 ("mptcp: drop the push_pending field")
  28e5c1380506 ("mptcp: annotate lockless accesses around read-mostly fields")

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
27 files changed:
.mailmap
Documentation/netlink/specs/dpll.yaml
MAINTAINERS
drivers/dpll/dpll_netlink.c
drivers/net/bonding/bond_main.c
drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
drivers/net/ethernet/mellanox/mlx5/core/dpll.c
drivers/net/ethernet/renesas/ravb_main.c
include/linux/netdevice.h
net/core/dev.c
net/core/rtnetlink.c
net/ipv4/ip_gre.c
net/ipv4/ip_output.c
net/ipv4/ip_tunnel.c
net/ipv4/ip_vti.c
net/ipv4/ipip.c
net/ipv6/sit.c
net/mptcp/options.c
net/mptcp/protocol.c
net/mptcp/protocol.h
net/mptcp/subflow.c
net/sched/act_mirred.c
net/smc/af_smc.c
net/unix/garbage.c
net/xfrm/xfrm_user.c
tools/testing/selftests/net/openvswitch/openvswitch.sh

diff --combined .mailmap
index 95b4fe465cc6dd9d33b8f0660b171c8f563cb6b6,b99a238ee3bde17fdf4e0f6b9ca0aee81e1dc9a7..1eb607efcc6ea49075cc250b0595c4d9f15e41f8
+++ b/.mailmap
@@@ -191,10 -191,11 +191,11 @@@ Gao Xiang <xiang@kernel.org> <gaoxiang2
  Gao Xiang <xiang@kernel.org> <hsiangkao@aol.com>
  Gao Xiang <xiang@kernel.org> <hsiangkao@linux.alibaba.com>
  Gao Xiang <xiang@kernel.org> <hsiangkao@redhat.com>
- Geliang Tang <geliang.tang@linux.dev> <geliang.tang@suse.com>
- Geliang Tang <geliang.tang@linux.dev> <geliangtang@xiaomi.com>
- Geliang Tang <geliang.tang@linux.dev> <geliangtang@gmail.com>
- Geliang Tang <geliang.tang@linux.dev> <geliangtang@163.com>
+ Geliang Tang <geliang@kernel.org> <geliang.tang@linux.dev>
+ Geliang Tang <geliang@kernel.org> <geliang.tang@suse.com>
+ Geliang Tang <geliang@kernel.org> <geliangtang@xiaomi.com>
+ Geliang Tang <geliang@kernel.org> <geliangtang@gmail.com>
+ Geliang Tang <geliang@kernel.org> <geliangtang@163.com>
  Georgi Djakov <djakov@kernel.org> <georgi.djakov@linaro.org>
  Gerald Schaefer <gerald.schaefer@linux.ibm.com> <geraldsc@de.ibm.com>
  Gerald Schaefer <gerald.schaefer@linux.ibm.com> <gerald.schaefer@de.ibm.com>
@@@ -289,6 -290,7 +290,7 @@@ Johan Hovold <johan@kernel.org> <johan@
  John Crispin <john@phrozen.org> <blogic@openwrt.org>
  John Fastabend <john.fastabend@gmail.com> <john.r.fastabend@intel.com>
  John Keeping <john@keeping.me.uk> <john@metanate.com>
+ John Moon <john@jmoon.dev> <quic_johmoo@quicinc.com>
  John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
  John Stultz <johnstul@us.ibm.com>
  <jon.toppins+linux@gmail.com> <jtoppins@cumulusnetworks.com>
@@@ -344,6 -346,7 +346,7 @@@ Leonid I Ananiev <leonid.i.ananiev@inte
  Leon Romanovsky <leon@kernel.org> <leon@leon.nu>
  Leon Romanovsky <leon@kernel.org> <leonro@mellanox.com>
  Leon Romanovsky <leon@kernel.org> <leonro@nvidia.com>
+ Leo Yan <leo.yan@linux.dev> <leo.yan@linaro.org>
  Liam Mark <quic_lmark@quicinc.com> <lmark@codeaurora.org>
  Linas Vepstas <linas@austin.ibm.com>
  Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
@@@ -568,7 -571,6 +571,7 @@@ Simon Kelley <simon@thekelleys.org.uk
  Sricharan Ramabadhran <quic_srichara@quicinc.com> <sricharan@codeaurora.org>
  Srinivas Ramana <quic_sramana@quicinc.com> <sramana@codeaurora.org>
  Sriram R <quic_srirrama@quicinc.com> <srirrama@codeaurora.org>
 +Stefan Wahren <wahrenst@gmx.net> <stefan.wahren@i2se.com>
  Stéphane Witzmann <stephane.witzmann@ubpmes.univ-bpclermont.fr>
  Stephen Hemminger <stephen@networkplumber.org> <shemminger@linux-foundation.org>
  Stephen Hemminger <stephen@networkplumber.org> <shemminger@osdl.org>
diff --combined Documentation/netlink/specs/dpll.yaml
index 1755066d8308c3f6a2b138090fe35033bce78367,3dcc9ece272aad6842a6297c6d5bf2cca2c2acc3..8dc1df5cfae7466506d6ca163cdae2d6515d377c
@@@ -51,40 -51,6 +51,40 @@@ definitions
            if dpll lock-state was not DPLL_LOCK_STATUS_LOCKED_HO_ACQ, the
            dpll's lock-state shall remain DPLL_LOCK_STATUS_UNLOCKED)
      render-max: true
 +  -
 +    type: enum
 +    name: lock-status-error
 +    doc: |
 +      if previous status change was done due to a failure, this provides
 +      information of dpll device lock status error.
 +      Valid values for DPLL_A_LOCK_STATUS_ERROR attribute
 +    entries:
 +      -
 +        name: none
 +        doc: |
 +          dpll device lock status was changed without any error
 +        value: 1
 +      -
 +        name: undefined
 +        doc: |
 +          dpll device lock status was changed due to undefined error.
 +          Driver fills this value up in case it is not able
 +          to obtain suitable exact error type.
 +      -
 +        name: media-down
 +        doc: |
 +          dpll device lock status was changed because of associated
 +          media got down.
 +          This may happen for example if dpll device was previously
 +          locked on an input pin of type PIN_TYPE_SYNCE_ETH_PORT.
 +      -
 +        name: fractional-frequency-offset-too-high
 +        doc: |
 +          the FFO (Fractional Frequency Offset) between the RX and TX
 +          symbol rate on the media got too high.
 +          This may happen for example if dpll device was previously
 +          locked on an input pin of type PIN_TYPE_SYNCE_ETH_PORT.
 +    render-max: true
    -
      type: const
      name: temp-divider
@@@ -248,10 -214,6 +248,10 @@@ attribute-sets
          name: type
          type: u32
          enum: type
 +      -
 +        name: lock-status-error
 +        type: u32
 +        enum: lock-status-error
    -
      name: pin
      enum-name: dpll_a_pin
@@@ -417,14 -379,11 +417,12 @@@ operations
              - mode
              - mode-supported
              - lock-status
 +            - lock-status-error
              - temp
              - clock-id
              - type
  
        dump:
-         pre: dpll-lock-dumpit
-         post: dpll-unlock-dumpit
          reply: *dev-attrs
  
      -
              - fractional-frequency-offset
  
        dump:
-         pre: dpll-lock-dumpit
-         post: dpll-unlock-dumpit
          request:
            attributes:
              - id
diff --combined MAINTAINERS
index 2b775f4369e078d1d53063d0eae273879965c288,a0697e2fb8e8bc97a9062bdd8771564dd3969fe8..ac771f6e4a3a423cb1d7369118c1526e1d879603
@@@ -3799,7 -3799,6 +3799,7 @@@ M:      Alexei Starovoitov <ast@kernel.org
  M:    Daniel Borkmann <daniel@iogearbox.net>
  M:    Andrii Nakryiko <andrii@kernel.org>
  R:    Martin KaFai Lau <martin.lau@linux.dev>
 +R:    Eduard Zingerman <eddyz87@gmail.com>
  R:    Song Liu <song@kernel.org>
  R:    Yonghong Song <yonghong.song@linux.dev>
  R:    John Fastabend <john.fastabend@gmail.com>
@@@ -3860,7 -3859,6 +3860,7 @@@ F:      net/unix/unix_bpf.
  
  BPF [LIBRARY] (libbpf)
  M:    Andrii Nakryiko <andrii@kernel.org>
 +M:    Eduard Zingerman <eddyz87@gmail.com>
  L:    bpf@vger.kernel.org
  S:    Maintained
  F:    tools/lib/bpf/
@@@ -3918,7 -3916,6 +3918,7 @@@ F:      security/bpf
  
  BPF [SELFTESTS] (Test Runners & Infrastructure)
  M:    Andrii Nakryiko <andrii@kernel.org>
 +M:    Eduard Zingerman <eddyz87@gmail.com>
  R:    Mykola Lysenko <mykolal@fb.com>
  L:    bpf@vger.kernel.org
  S:    Maintained
@@@ -4632,8 -4629,8 +4632,8 @@@ S:      Maintaine
  F:    net/sched/sch_cake.c
  
  CAN NETWORK DRIVERS
 -M:    Wolfgang Grandegger <wg@grandegger.com>
  M:    Marc Kleine-Budde <mkl@pengutronix.de>
 +M:    Vincent Mailhol <mailhol.vincent@wanadoo.fr>
  L:    linux-can@vger.kernel.org
  S:    Maintained
  W:    https://github.com/linux-can
@@@ -7887,13 -7884,6 +7887,13 @@@ S:    Maintaine
  F:    include/linux/errseq.h
  F:    lib/errseq.c
  
 +ESD CAN NETWORK DRIVERS
 +M:    Stefan Mätje <stefan.maetje@esd.eu>
 +R:    socketcan@esd.eu
 +L:    linux-can@vger.kernel.org
 +S:    Maintained
 +F:    drivers/net/can/esd/
 +
  ESD CAN/USB DRIVERS
  M:    Frank Jungclaus <frank.jungclaus@esd.eu>
  R:    socketcan@esd.eu
@@@ -10811,11 -10801,11 +10811,11 @@@ F:        drivers/gpio/gpio-tangier.
  
  INTEL GVT-g DRIVERS (Intel GPU Virtualization)
  M:    Zhenyu Wang <zhenyuw@linux.intel.com>
- M:    Zhi Wang <zhi.a.wang@intel.com>
+ M:    Zhi Wang <zhi.wang.linux@gmail.com>
  L:    intel-gvt-dev@lists.freedesktop.org
  L:    intel-gfx@lists.freedesktop.org
  S:    Supported
- W:    https://01.org/igvt-g
+ W:    https://github.com/intel/gvt-linux/wiki
  T:    git https://github.com/intel/gvt-linux.git
  F:    drivers/gpu/drm/i915/gvt/
  
@@@ -13066,15 -13056,6 +13066,15 @@@ L: netdev@vger.kernel.or
  S:    Supported
  F:    drivers/net/ethernet/marvell/octeon_ep
  
 +MARVELL OCTEON ENDPOINT VF DRIVER
 +M:    Veerasenareddy Burru <vburru@marvell.com>
 +M:    Sathesh Edara <sedara@marvell.com>
 +M:    Shinas Rasheed <srasheed@marvell.com>
 +M:    Satananda Burla <sburla@marvell.com>
 +L:    netdev@vger.kernel.org
 +S:    Supported
 +F:    drivers/net/ethernet/marvell/octeon_ep_vf
 +
  MARVELL OCTEONTX2 PHYSICAL FUNCTION DRIVER
  M:    Sunil Goutham <sgoutham@marvell.com>
  M:    Geetha sowjanya <gakula@marvell.com>
@@@ -15102,7 -15083,6 +15102,7 @@@ NETDEVSI
  M:    Jakub Kicinski <kuba@kernel.org>
  S:    Maintained
  F:    drivers/net/netdevsim/*
 +F:    tools/testing/selftests/drivers/net/netdevsim/*
  
  NETEM NETWORK EMULATOR
  M:    Stephen Hemminger <stephen@networkplumber.org>
@@@ -15344,7 -15324,7 +15344,7 @@@ K:   \bmdo
  NETWORKING [MPTCP]
  M:    Matthieu Baerts <matttbe@kernel.org>
  M:    Mat Martineau <martineau@kernel.org>
- R:    Geliang Tang <geliang.tang@linux.dev>
+ R:    Geliang Tang <geliang@kernel.org>
  L:    netdev@vger.kernel.org
  L:    mptcp@lists.linux.dev
  S:    Maintained
@@@ -17202,7 -17182,7 +17202,7 @@@ R:   John Garry <john.g.garry@oracle.com
  R:    Will Deacon <will@kernel.org>
  R:    James Clark <james.clark@arm.com>
  R:    Mike Leach <mike.leach@linaro.org>
- R:    Leo Yan <leo.yan@linaro.org>
+ R:    Leo Yan <leo.yan@linux.dev>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Supported
  F:    tools/build/feature/test-libopencsd.c
@@@ -18033,13 -18013,6 +18033,13 @@@ T: git git://git.kernel.org/pub/scm/lin
  F:    Documentation/devicetree/bindings/net/wireless/qca,ath9k.yaml
  F:    drivers/net/wireless/ath/ath9k/
  
 +QUALCOMM ATHEROS QCA7K ETHERNET DRIVER
 +M:    Stefan Wahren <wahrenst@gmx.net>
 +L:    netdev@vger.kernel.org
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/net/qca,qca7000.txt
 +F:    drivers/net/ethernet/qualcomm/qca*
 +
  QUALCOMM BAM-DMUX WWAN NETWORK DRIVER
  M:    Stephan Gerhold <stephan@gerhold.net>
  L:    netdev@vger.kernel.org
@@@ -24149,6 -24122,7 +24149,6 @@@ F:   drivers/net/ethernet/xilinx/xilinx_a
  
  XILINX CAN DRIVER
  M:    Appana Durga Kedareswara rao <appana.durga.rao@xilinx.com>
 -R:    Naga Sureshkumar Relli <naga.sureshkumar.relli@xilinx.com>
  L:    linux-can@vger.kernel.org
  S:    Maintained
  F:    Documentation/devicetree/bindings/net/can/xilinx,can.yaml
diff --combined drivers/dpll/dpll_netlink.c
index cf3313517ae17f82e7b42eaf905040b47097f5ff,4ca9ad16cd957aaefaf50a74bbeb27ab3f3d1ec7..1419fd0d241c281497a67ba3a6b8dd1b345f9ec3
@@@ -121,21 -121,14 +121,21 @@@ dpll_msg_add_lock_status(struct sk_buf
                         struct netlink_ext_ack *extack)
  {
        const struct dpll_device_ops *ops = dpll_device_ops(dpll);
 +      enum dpll_lock_status_error status_error = 0;
        enum dpll_lock_status status;
        int ret;
  
 -      ret = ops->lock_status_get(dpll, dpll_priv(dpll), &status, extack);
 +      ret = ops->lock_status_get(dpll, dpll_priv(dpll), &status,
 +                                 &status_error, extack);
        if (ret)
                return ret;
        if (nla_put_u32(msg, DPLL_A_LOCK_STATUS, status))
                return -EMSGSIZE;
 +      if (status_error &&
 +          (status == DPLL_LOCK_STATUS_UNLOCKED ||
 +           status == DPLL_LOCK_STATUS_HOLDOVER) &&
 +          nla_put_u32(msg, DPLL_A_LOCK_STATUS_ERROR, status_error))
 +              return -EMSGSIZE;
  
        return 0;
  }
@@@ -1206,6 -1199,7 +1206,7 @@@ int dpll_nl_pin_get_dumpit(struct sk_bu
        unsigned long i;
        int ret = 0;
  
+       mutex_lock(&dpll_lock);
        xa_for_each_marked_start(&dpll_pin_xa, i, pin, DPLL_REGISTERED,
                                 ctx->idx) {
                if (!dpll_pin_available(pin))
                }
                genlmsg_end(skb, hdr);
        }
+       mutex_unlock(&dpll_lock);
        if (ret == -EMSGSIZE) {
                ctx->idx = i;
                return skb->len;
@@@ -1380,6 -1376,7 +1383,7 @@@ int dpll_nl_device_get_dumpit(struct sk
        unsigned long i;
        int ret = 0;
  
+       mutex_lock(&dpll_lock);
        xa_for_each_marked_start(&dpll_device_xa, i, dpll, DPLL_REGISTERED,
                                 ctx->idx) {
                hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
                }
                genlmsg_end(skb, hdr);
        }
+       mutex_unlock(&dpll_lock);
        if (ret == -EMSGSIZE) {
                ctx->idx = i;
                return skb->len;
@@@ -1446,20 -1445,6 +1452,6 @@@ dpll_unlock_doit(const struct genl_spli
        mutex_unlock(&dpll_lock);
  }
  
- int dpll_lock_dumpit(struct netlink_callback *cb)
- {
-       mutex_lock(&dpll_lock);
-       return 0;
- }
- int dpll_unlock_dumpit(struct netlink_callback *cb)
- {
-       mutex_unlock(&dpll_lock);
-       return 0;
- }
  int dpll_pin_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
                      struct genl_info *info)
  {
diff --combined drivers/net/bonding/bond_main.c
index cb67ece47328cc50c6158cc0408d1820ef8c6dd4,a11748b8d69b435cf97971cec21c0340365ed6d1..a8a6c53095186cc32bb3c5ece1aaaff301f5056b
@@@ -1819,6 -1819,8 +1819,8 @@@ void bond_xdp_set_features(struct net_d
        bond_for_each_slave(bond, slave, iter)
                val &= slave->dev->xdp_features;
  
+       val &= ~NETDEV_XDP_ACT_XSK_ZEROCOPY;
        xdp_set_features_flag(bond_dev, val);
  }
  
@@@ -5909,9 -5911,6 +5911,6 @@@ void bond_setup(struct net_device *bond
        if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
                bond_dev->features |= BOND_XFRM_FEATURES;
  #endif /* CONFIG_XFRM_OFFLOAD */
-       if (bond_xdp_check(bond))
-               bond_dev->xdp_features = NETDEV_XDP_ACT_MASK;
  }
  
  /* Destroy a bonding device.
@@@ -6306,7 -6305,6 +6305,7 @@@ static int __init bond_check_params(str
        params->ad_actor_sys_prio = ad_actor_sys_prio;
        eth_zero_addr(params->ad_actor_system);
        params->ad_user_port_key = ad_user_port_key;
 +      params->coupled_control = 1;
        if (packets_per_slave > 0) {
                params->reciprocal_packets_per_slave =
                        reciprocal_value(packets_per_slave);
@@@ -6416,41 -6414,28 +6415,41 @@@ static int __net_init bond_net_init(str
        return 0;
  }
  
 -static void __net_exit bond_net_exit_batch(struct list_head *net_list)
 +/* According to commit 69b0216ac255 ("bonding: fix bonding_masters
 + * race condition in bond unloading") we need to remove sysfs files
 + * before we remove our devices (done later in bond_net_exit_batch_rtnl())
 + */
 +static void __net_exit bond_net_pre_exit(struct net *net)
 +{
 +      struct bond_net *bn = net_generic(net, bond_net_id);
 +
 +      bond_destroy_sysfs(bn);
 +}
 +
 +static void __net_exit bond_net_exit_batch_rtnl(struct list_head *net_list,
 +                                              struct list_head *dev_kill_list)
  {
        struct bond_net *bn;
        struct net *net;
 -      LIST_HEAD(list);
 -
 -      list_for_each_entry(net, net_list, exit_list) {
 -              bn = net_generic(net, bond_net_id);
 -              bond_destroy_sysfs(bn);
 -      }
  
        /* Kill off any bonds created after unregistering bond rtnl ops */
 -      rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                struct bonding *bond, *tmp_bond;
  
                bn = net_generic(net, bond_net_id);
                list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list)
 -                      unregister_netdevice_queue(bond->dev, &list);
 +                      unregister_netdevice_queue(bond->dev, dev_kill_list);
        }
 -      unregister_netdevice_many(&list);
 -      rtnl_unlock();
 +}
 +
 +/* According to commit 23fa5c2caae0 ("bonding: destroy proc directory
 + * only after all bonds are gone") bond_destroy_proc_dir() is called
 + * after bond_net_exit_batch_rtnl() has completed.
 + */
 +static void __net_exit bond_net_exit_batch(struct list_head *net_list)
 +{
 +      struct bond_net *bn;
 +      struct net *net;
  
        list_for_each_entry(net, net_list, exit_list) {
                bn = net_generic(net, bond_net_id);
  
  static struct pernet_operations bond_net_ops = {
        .init = bond_net_init,
 +      .pre_exit = bond_net_pre_exit,
 +      .exit_batch_rtnl = bond_net_exit_batch_rtnl,
        .exit_batch = bond_net_exit_batch,
        .id   = &bond_net_id,
        .size = sizeof(struct bond_net),
diff --combined drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c
index 3a15f269c7d18f865f9acc79a5c28ea4258c6f07,f59557b0cd51523896890ffe6121ffbac54f5f70..0420f17e53a9643dfb6b9a299ca038743847ac62
@@@ -607,7 -607,6 +607,7 @@@ static void bcmasp_adj_link(struct net_
        struct phy_device *phydev = dev->phydev;
        u32 cmd_bits = 0, reg;
        int changed = 0;
 +      bool active;
  
        if (intf->old_link != phydev->link) {
                changed = 1;
                reg |= cmd_bits;
                umac_wl(intf, reg, UMC_CMD);
  
 -              intf->eee.eee_active = phy_init_eee(phydev, 0) >= 0;
 -              bcmasp_eee_enable_set(intf, intf->eee.eee_active);
 +              active = phy_init_eee(phydev, 0) >= 0;
 +              bcmasp_eee_enable_set(intf, active);
        }
  
        reg = rgmii_rl(intf, RGMII_OOB_CNTRL);
@@@ -685,6 -684,8 +685,8 @@@ static int bcmasp_init_rx(struct bcmasp
  
        intf->rx_buf_order = get_order(RING_BUFFER_SIZE);
        buffer_pg = alloc_pages(GFP_KERNEL, intf->rx_buf_order);
+       if (!buffer_pg)
+               return -ENOMEM;
  
        dma = dma_map_page(kdev, buffer_pg, 0, RING_BUFFER_SIZE,
                           DMA_FROM_DEVICE);
@@@ -1093,6 -1094,7 +1095,7 @@@ static int bcmasp_netif_init(struct net
        return 0;
  
  err_reclaim_tx:
+       netif_napi_del(&intf->tx_napi);
        bcmasp_reclaim_free_all_tx(intf);
  err_phy_disconnect:
        if (phydev)
diff --combined drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
index 8190d23a6a495c54931c183dbce15ee36868f868,e5d6156655ba48ee607c531e7c8951aa32d6e117..8ec0b97646491caf9a098dc1edf4931e6642e45f
@@@ -61,28 -61,6 +61,6 @@@ int rvu_npc_get_tx_nibble_cfg(struct rv
        return 0;
  }
  
- static int npc_mcam_verify_pf_func(struct rvu *rvu,
-                                  struct mcam_entry *entry_data, u8 intf,
-                                  u16 pcifunc)
- {
-       u16 pf_func, pf_func_mask;
-       if (is_npc_intf_rx(intf))
-               return 0;
-       pf_func_mask = (entry_data->kw_mask[0] >> 32) &
-               NPC_KEX_PF_FUNC_MASK;
-       pf_func = (entry_data->kw[0] >> 32) & NPC_KEX_PF_FUNC_MASK;
-       pf_func = be16_to_cpu((__force __be16)pf_func);
-       if (pf_func_mask != NPC_KEX_PF_FUNC_MASK ||
-           ((pf_func & ~RVU_PFVF_FUNC_MASK) !=
-            (pcifunc & ~RVU_PFVF_FUNC_MASK)))
-               return -EINVAL;
-       return 0;
- }
  void rvu_npc_set_pkind(struct rvu *rvu, int pkind, struct rvu_pfvf *pfvf)
  {
        int blkaddr;
@@@ -417,7 -395,7 +395,7 @@@ static void npc_fixup_vf_rule(struct rv
        owner = mcam->entry2pfvf_map[index];
        target_func = (entry->action >> 4) & 0xffff;
        /* do nothing when target is LBK/PF or owner is not PF */
 -      if (is_pffunc_af(owner) || is_afvf(target_func) ||
 +      if (is_pffunc_af(owner) || is_lbk_vf(rvu, target_func) ||
            (owner & RVU_PFVF_FUNC_MASK) ||
            !(target_func & RVU_PFVF_FUNC_MASK))
                return;
@@@ -626,7 -604,7 +604,7 @@@ void rvu_npc_install_ucast_entry(struc
        int blkaddr, index;
  
        /* AF's and SDP VFs work in promiscuous mode */
 -      if (is_afvf(pcifunc) || is_sdp_vf(pcifunc))
 +      if (is_lbk_vf(rvu, pcifunc) || is_sdp_vf(rvu, pcifunc))
                return;
  
        blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
@@@ -791,7 -769,7 +769,7 @@@ void rvu_npc_install_bcast_match_entry(
                return;
  
        /* Skip LBK VFs */
 -      if (is_afvf(pcifunc))
 +      if (is_lbk_vf(rvu, pcifunc))
                return;
  
        /* If pkt replication is not supported,
@@@ -871,7 -849,7 +849,7 @@@ void rvu_npc_install_allmulti_entry(str
        u16 vf_func;
  
        /* Only CGX PF/VF can add allmulticast entry */
 -      if (is_afvf(pcifunc) && is_sdp_vf(pcifunc))
 +      if (is_lbk_vf(rvu, pcifunc) && is_sdp_vf(rvu, pcifunc))
                return;
  
        blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
@@@ -2851,12 -2829,6 +2829,6 @@@ int rvu_mbox_handler_npc_mcam_write_ent
        else
                nix_intf = pfvf->nix_rx_intf;
  
-       if (!is_pffunc_af(pcifunc) &&
-           npc_mcam_verify_pf_func(rvu, &req->entry_data, req->intf, pcifunc)) {
-               rc = NPC_MCAM_INVALID_REQ;
-               goto exit;
-       }
        /* For AF installed rules, the nix_intf should be set to target NIX */
        if (is_pffunc_af(req->hdr.pcifunc))
                nix_intf = req->intf;
@@@ -3208,10 -3180,6 +3180,6 @@@ int rvu_mbox_handler_npc_mcam_alloc_and
        if (!is_npc_interface_valid(rvu, req->intf))
                return NPC_MCAM_INVALID_REQ;
  
-       if (npc_mcam_verify_pf_func(rvu, &req->entry_data, req->intf,
-                                   req->hdr.pcifunc))
-               return NPC_MCAM_INVALID_REQ;
        /* Try to allocate a MCAM entry */
        entry_req.hdr.pcifunc = req->hdr.pcifunc;
        entry_req.contig = true;
diff --combined drivers/net/ethernet/mellanox/mlx5/core/dpll.c
index 4ad3d2d3d4c8e7a24d5e74f7c76a43a37698ac43,928bf24d4b123945afc9df29ea5d758792d269cb..c9c7fddb246f83ecefd0d437506e264392d019cd
@@@ -41,7 -41,6 +41,7 @@@ struct mlx5_dpll_synce_status 
        enum mlx5_msees_oper_status oper_status;
        bool ho_acq;
        bool oper_freq_measure;
 +      enum mlx5_msees_failure_reason failure_reason;
        s32 frequency_diff;
  };
  
@@@ -61,7 -60,6 +61,7 @@@ mlx5_dpll_synce_status_get(struct mlx5_
        synce_status->oper_status = MLX5_GET(msees_reg, out, oper_status);
        synce_status->ho_acq = MLX5_GET(msees_reg, out, ho_acq);
        synce_status->oper_freq_measure = MLX5_GET(msees_reg, out, oper_freq_measure);
 +      synce_status->failure_reason = MLX5_GET(msees_reg, out, failure_reason);
        synce_status->frequency_diff = MLX5_GET(msees_reg, out, frequency_diff);
        return 0;
  }
@@@ -101,26 -99,6 +101,26 @@@ mlx5_dpll_lock_status_get(struct mlx5_d
        }
  }
  
 +static enum dpll_lock_status_error
 +mlx5_dpll_lock_status_error_get(struct mlx5_dpll_synce_status *synce_status)
 +{
 +      switch (synce_status->oper_status) {
 +      case MLX5_MSEES_OPER_STATUS_FAIL_HOLDOVER:
 +              fallthrough;
 +      case MLX5_MSEES_OPER_STATUS_FAIL_FREE_RUNNING:
 +              switch (synce_status->failure_reason) {
 +              case MLX5_MSEES_FAILURE_REASON_PORT_DOWN:
 +                      return DPLL_LOCK_STATUS_ERROR_MEDIA_DOWN;
 +              case MLX5_MSEES_FAILURE_REASON_TOO_HIGH_FREQUENCY_DIFF:
 +                      return DPLL_LOCK_STATUS_ERROR_FRACTIONAL_FREQUENCY_OFFSET_TOO_HIGH;
 +              default:
 +                      return DPLL_LOCK_STATUS_ERROR_UNDEFINED;
 +              }
 +      default:
 +              return DPLL_LOCK_STATUS_ERROR_NONE;
 +      }
 +}
 +
  static enum dpll_pin_state
  mlx5_dpll_pin_state_get(struct mlx5_dpll_synce_status *synce_status)
  {
@@@ -140,11 -118,10 +140,11 @@@ mlx5_dpll_pin_ffo_get(struct mlx5_dpll_
        return 0;
  }
  
 -static int mlx5_dpll_device_lock_status_get(const struct dpll_device *dpll,
 -                                          void *priv,
 -                                          enum dpll_lock_status *status,
 -                                          struct netlink_ext_ack *extack)
 +static int
 +mlx5_dpll_device_lock_status_get(const struct dpll_device *dpll, void *priv,
 +                               enum dpll_lock_status *status,
 +                               enum dpll_lock_status_error *status_error,
 +                               struct netlink_ext_ack *extack)
  {
        struct mlx5_dpll_synce_status synce_status;
        struct mlx5_dpll *mdpll = priv;
        if (err)
                return err;
        *status = mlx5_dpll_lock_status_get(&synce_status);
 +      *status_error = mlx5_dpll_lock_status_error_get(&synce_status);
        return 0;
  }
  
@@@ -413,7 -389,7 +413,7 @@@ static void mlx5_dpll_remove(struct aux
        struct mlx5_dpll *mdpll = auxiliary_get_drvdata(adev);
        struct mlx5_core_dev *mdev = mdpll->mdev;
  
-       cancel_delayed_work(&mdpll->work);
+       cancel_delayed_work_sync(&mdpll->work);
        mlx5_dpll_mdev_netdev_untrack(mdpll, mdev);
        destroy_workqueue(mdpll->wq);
        dpll_pin_unregister(mdpll->dpll, mdpll->dpll_pin,
diff --combined drivers/net/ethernet/renesas/ravb_main.c
index f9a1e9038dbf16717b8ac3b813bee7d35f48b79a,f7566cfa45ca37a3cfd02331c24f49bf576393a7..529670852bd6dc9857dad768248c73879101bbfc
@@@ -29,7 -29,6 +29,7 @@@
  #include <linux/spinlock.h>
  #include <linux/reset.h>
  #include <linux/math64.h>
 +#include <net/ip.h>
  
  #include "ravb.h"
  
                 NETIF_MSG_RX_ERR | \
                 NETIF_MSG_TX_ERR)
  
 -static const char *ravb_rx_irqs[NUM_RX_QUEUE] = {
 -      "ch0", /* RAVB_BE */
 -      "ch1", /* RAVB_NC */
 -};
 -
 -static const char *ravb_tx_irqs[NUM_TX_QUEUE] = {
 -      "ch18", /* RAVB_BE */
 -      "ch19", /* RAVB_NC */
 -};
 -
  void ravb_modify(struct net_device *ndev, enum ravb_reg reg, u32 clear,
                 u32 set)
  {
@@@ -87,13 -96,13 +87,13 @@@ static void ravb_set_rate_gbeth(struct 
        struct ravb_private *priv = netdev_priv(ndev);
  
        switch (priv->speed) {
 -      case 10:                /* 10BASE */
 +      case 10:                /* 10BASE */
                ravb_write(ndev, GBETH_GECMR_SPEED_10, GECMR);
                break;
 -      case 100:               /* 100BASE */
 +      case 100:               /* 100BASE */
                ravb_write(ndev, GBETH_GECMR_SPEED_100, GECMR);
                break;
 -      case 1000:              /* 1000BASE */
 +      case 1000:              /* 1000BASE */
                ravb_write(ndev, GBETH_GECMR_SPEED_1000, GECMR);
                break;
        }
@@@ -513,36 -522,6 +513,36 @@@ error
        return -ENOMEM;
  }
  
 +static void ravb_csum_init_gbeth(struct net_device *ndev)
 +{
 +      bool tx_enable = ndev->features & NETIF_F_HW_CSUM;
 +      bool rx_enable = ndev->features & NETIF_F_RXCSUM;
 +
 +      if (!(tx_enable || rx_enable))
 +              goto done;
 +
 +      ravb_write(ndev, 0, CSR0);
 +      if (ravb_wait(ndev, CSR0, CSR0_TPE | CSR0_RPE, 0)) {
 +              netdev_err(ndev, "Timeout enabling hardware checksum\n");
 +
 +              if (tx_enable)
 +                      ndev->features &= ~NETIF_F_HW_CSUM;
 +
 +              if (rx_enable)
 +                      ndev->features &= ~NETIF_F_RXCSUM;
 +      } else {
 +              if (tx_enable)
 +                      ravb_write(ndev, CSR1_TIP4 | CSR1_TTCP4 | CSR1_TUDP4, CSR1);
 +
 +              if (rx_enable)
 +                      ravb_write(ndev, CSR2_RIP4 | CSR2_RTCP4 | CSR2_RUDP4 | CSR2_RICMP4,
 +                                 CSR2);
 +      }
 +
 +done:
 +      ravb_write(ndev, CSR0_TPE | CSR0_RPE, CSR0);
 +}
 +
  static void ravb_emac_init_gbeth(struct net_device *ndev)
  {
        struct ravb_private *priv = netdev_priv(ndev);
  
        /* E-MAC status register clear */
        ravb_write(ndev, ECSR_ICD | ECSR_LCHNG | ECSR_PFRI, ECSR);
 -      ravb_write(ndev, CSR0_TPE | CSR0_RPE, CSR0);
 +
 +      ravb_csum_init_gbeth(ndev);
  
        /* E-MAC interrupt enable register */
        ravb_write(ndev, ECSIPR_ICDIP, ECSIPR);
@@@ -756,30 -734,6 +756,30 @@@ static void ravb_get_tx_tstamp(struct n
        }
  }
  
 +static void ravb_rx_csum_gbeth(struct sk_buff *skb)
 +{
 +      __wsum csum_ip_hdr, csum_proto;
 +      u8 *hw_csum;
 +
 +      /* The hardware checksum status is contained in sizeof(__sum16) * 2 = 4
 +       * bytes appended to packet data. First 2 bytes is ip header checksum
 +       * and last 2 bytes is protocol checksum.
 +       */
 +      if (unlikely(skb->len < sizeof(__sum16) * 2))
 +              return;
 +
 +      hw_csum = skb_tail_pointer(skb) - sizeof(__sum16);
 +      csum_proto = csum_unfold((__force __sum16)get_unaligned_le16(hw_csum));
 +
 +      hw_csum -= sizeof(__sum16);
 +      csum_ip_hdr = csum_unfold((__force __sum16)get_unaligned_le16(hw_csum));
 +      skb_trim(skb, skb->len - 2 * sizeof(__sum16));
 +
 +      /* TODO: IPV6 Rx checksum */
 +      if (skb->protocol == htons(ETH_P_IP) && !csum_ip_hdr && !csum_proto)
 +              skb->ip_summed = CHECKSUM_UNNECESSARY;
 +}
 +
  static void ravb_rx_csum(struct sk_buff *skb)
  {
        u8 *hw_csum;
@@@ -818,29 -772,25 +818,25 @@@ static bool ravb_rx_gbeth(struct net_de
        struct ravb_rx_desc *desc;
        struct sk_buff *skb;
        dma_addr_t dma_addr;
+       int rx_packets = 0;
        u8  desc_status;
-       int boguscnt;
        u16 pkt_len;
        u8  die_dt;
        int entry;
        int limit;
+       int i;
  
        entry = priv->cur_rx[q] % priv->num_rx_ring[q];
-       boguscnt = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q];
+       limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q];
        stats = &priv->stats[q];
  
-       boguscnt = min(boguscnt, *quota);
-       limit = boguscnt;
        desc = &priv->gbeth_rx_ring[entry];
-       while (desc->die_dt != DT_FEMPTY) {
+       for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) {
                /* Descriptor type must be checked before all other reads */
                dma_rmb();
                desc_status = desc->msc;
                pkt_len = le16_to_cpu(desc->ds_cc) & RX_DS;
  
-               if (--boguscnt < 0)
-                       break;
                /* We use 0-byte descriptors to mark the DMA mapping errors */
                if (!pkt_len)
                        continue;
                                skb = ravb_get_skb_gbeth(ndev, entry, desc);
                                skb_put(skb, pkt_len);
                                skb->protocol = eth_type_trans(skb, ndev);
 +                              if (ndev->features & NETIF_F_RXCSUM)
 +                                      ravb_rx_csum_gbeth(skb);
                                napi_gro_receive(&priv->napi[q], skb);
-                               stats->rx_packets++;
+                               rx_packets++;
                                stats->rx_bytes += pkt_len;
                                break;
                        case DT_FSTART:
                                dev_kfree_skb(skb);
                                priv->rx_1st_skb->protocol =
                                        eth_type_trans(priv->rx_1st_skb, ndev);
 +                              if (ndev->features & NETIF_F_RXCSUM)
 +                                      ravb_rx_csum_gbeth(skb);
                                napi_gro_receive(&priv->napi[q],
                                                 priv->rx_1st_skb);
-                               stats->rx_packets++;
+                               rx_packets++;
                                stats->rx_bytes += pkt_len;
                                break;
                        }
                desc->die_dt = DT_FEMPTY;
        }
  
-       *quota -= limit - (++boguscnt);
-       return boguscnt <= 0;
+       stats->rx_packets += rx_packets;
+       *quota -= rx_packets;
+       return *quota == 0;
  }
  
  /* Packet receive function for Ethernet AVB */
@@@ -1142,23 -1088,11 +1138,23 @@@ static irqreturn_t ravb_emac_interrupt(
  {
        struct net_device *ndev = dev_id;
        struct ravb_private *priv = netdev_priv(ndev);
 +      struct device *dev = &priv->pdev->dev;
 +      irqreturn_t result = IRQ_HANDLED;
 +
 +      pm_runtime_get_noresume(dev);
 +
 +      if (unlikely(!pm_runtime_active(dev))) {
 +              result = IRQ_NONE;
 +              goto out_rpm_put;
 +      }
  
        spin_lock(&priv->lock);
        ravb_emac_interrupt_unlocked(ndev);
        spin_unlock(&priv->lock);
 -      return IRQ_HANDLED;
 +
 +out_rpm_put:
 +      pm_runtime_put_noidle(dev);
 +      return result;
  }
  
  /* Error interrupt handler */
@@@ -1238,15 -1172,9 +1234,15 @@@ static irqreturn_t ravb_interrupt(int i
        struct net_device *ndev = dev_id;
        struct ravb_private *priv = netdev_priv(ndev);
        const struct ravb_hw_info *info = priv->info;
 +      struct device *dev = &priv->pdev->dev;
        irqreturn_t result = IRQ_NONE;
        u32 iss;
  
 +      pm_runtime_get_noresume(dev);
 +
 +      if (unlikely(!pm_runtime_active(dev)))
 +              goto out_rpm_put;
 +
        spin_lock(&priv->lock);
        /* Get interrupt status */
        iss = ravb_read(ndev, ISS);
        }
  
        spin_unlock(&priv->lock);
 +
 +out_rpm_put:
 +      pm_runtime_put_noidle(dev);
        return result;
  }
  
@@@ -1301,15 -1226,9 +1297,15 @@@ static irqreturn_t ravb_multi_interrupt
  {
        struct net_device *ndev = dev_id;
        struct ravb_private *priv = netdev_priv(ndev);
 +      struct device *dev = &priv->pdev->dev;
        irqreturn_t result = IRQ_NONE;
        u32 iss;
  
 +      pm_runtime_get_noresume(dev);
 +
 +      if (unlikely(!pm_runtime_active(dev)))
 +              goto out_rpm_put;
 +
        spin_lock(&priv->lock);
        /* Get interrupt status */
        iss = ravb_read(ndev, ISS);
        }
  
        spin_unlock(&priv->lock);
 +
 +out_rpm_put:
 +      pm_runtime_put_noidle(dev);
        return result;
  }
  
@@@ -1341,14 -1257,8 +1337,14 @@@ static irqreturn_t ravb_dma_interrupt(i
  {
        struct net_device *ndev = dev_id;
        struct ravb_private *priv = netdev_priv(ndev);
 +      struct device *dev = &priv->pdev->dev;
        irqreturn_t result = IRQ_NONE;
  
 +      pm_runtime_get_noresume(dev);
 +
 +      if (unlikely(!pm_runtime_active(dev)))
 +              goto out_rpm_put;
 +
        spin_lock(&priv->lock);
  
        /* Network control/Best effort queue RX/TX */
                result = IRQ_HANDLED;
  
        spin_unlock(&priv->lock);
 +
 +out_rpm_put:
 +      pm_runtime_put_noidle(dev);
        return result;
  }
  
@@@ -1377,16 -1284,25 +1373,16 @@@ static int ravb_poll(struct napi_struc
        struct net_device *ndev = napi->dev;
        struct ravb_private *priv = netdev_priv(ndev);
        const struct ravb_hw_info *info = priv->info;
 -      bool gptp = info->gptp || info->ccc_gac;
 -      struct ravb_rx_desc *desc;
        unsigned long flags;
        int q = napi - priv->napi;
        int mask = BIT(q);
        int quota = budget;
 -      unsigned int entry;
  
 -      if (!gptp) {
 -              entry = priv->cur_rx[q] % priv->num_rx_ring[q];
 -              desc = &priv->gbeth_rx_ring[entry];
 -      }
        /* Processing RX Descriptor Ring */
        /* Clear RX interrupt */
        ravb_write(ndev, ~(mask | RIS0_RESERVED), RIS0);
 -      if (gptp || desc->die_dt != DT_FEMPTY) {
 -              if (ravb_rx(ndev, &quota, q))
 -                      goto out;
 -      }
 +      if (ravb_rx(ndev, &quota, q))
 +              goto out;
  
        /* Processing TX Descriptor Ring */
        spin_lock_irqsave(&priv->lock, flags);
@@@ -1816,154 -1732,89 +1812,154 @@@ static const struct ethtool_ops ravb_et
        .set_wol                = ravb_set_wol,
  };
  
 -static inline int ravb_hook_irq(unsigned int irq, irq_handler_t handler,
 -                              struct net_device *ndev, struct device *dev,
 -                              const char *ch)
 +static int ravb_set_config_mode(struct net_device *ndev)
  {
 -      char *name;
 +      struct ravb_private *priv = netdev_priv(ndev);
 +      const struct ravb_hw_info *info = priv->info;
        int error;
  
 -      name = devm_kasprintf(dev, GFP_KERNEL, "%s:%s", ndev->name, ch);
 -      if (!name)
 -              return -ENOMEM;
 -      error = request_irq(irq, handler, 0, name, ndev);
 -      if (error)
 -              netdev_err(ndev, "cannot request IRQ %s\n", name);
 +      if (info->gptp) {
 +              error = ravb_set_opmode(ndev, CCC_OPC_CONFIG);
 +              if (error)
 +                      return error;
 +              /* Set CSEL value */
 +              ravb_modify(ndev, CCC, CCC_CSEL, CCC_CSEL_HPB);
 +      } else if (info->ccc_gac) {
 +              error = ravb_set_opmode(ndev, CCC_OPC_CONFIG | CCC_GAC | CCC_CSEL_HPB);
 +      } else {
 +              error = ravb_set_opmode(ndev, CCC_OPC_CONFIG);
 +      }
  
        return error;
  }
  
 +static void ravb_set_gti(struct net_device *ndev)
 +{
 +      struct ravb_private *priv = netdev_priv(ndev);
 +      const struct ravb_hw_info *info = priv->info;
 +
 +      if (!(info->gptp || info->ccc_gac))
 +              return;
 +
 +      ravb_write(ndev, priv->gti_tiv, GTI);
 +
 +      /* Request GTI loading */
 +      ravb_modify(ndev, GCCR, GCCR_LTI, GCCR_LTI);
 +}
 +
 +static int ravb_compute_gti(struct net_device *ndev)
 +{
 +      struct ravb_private *priv = netdev_priv(ndev);
 +      const struct ravb_hw_info *info = priv->info;
 +      struct device *dev = ndev->dev.parent;
 +      unsigned long rate;
 +      u64 inc;
 +
 +      if (!(info->gptp || info->ccc_gac))
 +              return 0;
 +
 +      if (info->gptp_ref_clk)
 +              rate = clk_get_rate(priv->gptp_clk);
 +      else
 +              rate = clk_get_rate(priv->clk);
 +      if (!rate)
 +              return -EINVAL;
 +
 +      inc = div64_ul(1000000000ULL << 20, rate);
 +
 +      if (inc < GTI_TIV_MIN || inc > GTI_TIV_MAX) {
 +              dev_err(dev, "gti.tiv increment 0x%llx is outside the range 0x%x - 0x%x\n",
 +                      inc, GTI_TIV_MIN, GTI_TIV_MAX);
 +              return -EINVAL;
 +      }
 +      priv->gti_tiv = inc;
 +
 +      return 0;
 +}
 +
 +/* Set tx and rx clock internal delay modes */
 +static void ravb_parse_delay_mode(struct device_node *np, struct net_device *ndev)
 +{
 +      struct ravb_private *priv = netdev_priv(ndev);
 +      bool explicit_delay = false;
 +      u32 delay;
 +
 +      if (!priv->info->internal_delay)
 +              return;
 +
 +      if (!of_property_read_u32(np, "rx-internal-delay-ps", &delay)) {
 +              /* Valid values are 0 and 1800, according to DT bindings */
 +              priv->rxcidm = !!delay;
 +              explicit_delay = true;
 +      }
 +      if (!of_property_read_u32(np, "tx-internal-delay-ps", &delay)) {
 +              /* Valid values are 0 and 2000, according to DT bindings */
 +              priv->txcidm = !!delay;
 +              explicit_delay = true;
 +      }
 +
 +      if (explicit_delay)
 +              return;
 +
 +      /* Fall back to legacy rgmii-*id behavior */
 +      if (priv->phy_interface == PHY_INTERFACE_MODE_RGMII_ID ||
 +          priv->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID) {
 +              priv->rxcidm = 1;
 +              priv->rgmii_override = 1;
 +      }
 +
 +      if (priv->phy_interface == PHY_INTERFACE_MODE_RGMII_ID ||
 +          priv->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID) {
 +              priv->txcidm = 1;
 +              priv->rgmii_override = 1;
 +      }
 +}
 +
 +static void ravb_set_delay_mode(struct net_device *ndev)
 +{
 +      struct ravb_private *priv = netdev_priv(ndev);
 +      u32 set = 0;
 +
 +      if (!priv->info->internal_delay)
 +              return;
 +
 +      if (priv->rxcidm)
 +              set |= APSR_RDM;
 +      if (priv->txcidm)
 +              set |= APSR_TDM;
 +      ravb_modify(ndev, APSR, APSR_RDM | APSR_TDM, set);
 +}
 +
  /* Network device open function for Ethernet AVB */
  static int ravb_open(struct net_device *ndev)
  {
        struct ravb_private *priv = netdev_priv(ndev);
        const struct ravb_hw_info *info = priv->info;
 -      struct platform_device *pdev = priv->pdev;
 -      struct device *dev = &pdev->dev;
        int error;
  
        napi_enable(&priv->napi[RAVB_BE]);
        if (info->nc_queues)
                napi_enable(&priv->napi[RAVB_NC]);
  
 -      if (!info->multi_irqs) {
 -              error = request_irq(ndev->irq, ravb_interrupt, IRQF_SHARED,
 -                                  ndev->name, ndev);
 -              if (error) {
 -                      netdev_err(ndev, "cannot request IRQ\n");
 -                      goto out_napi_off;
 -              }
 -      } else {
 -              error = ravb_hook_irq(ndev->irq, ravb_multi_interrupt, ndev,
 -                                    dev, "ch22:multi");
 -              if (error)
 -                      goto out_napi_off;
 -              error = ravb_hook_irq(priv->emac_irq, ravb_emac_interrupt, ndev,
 -                                    dev, "ch24:emac");
 -              if (error)
 -                      goto out_free_irq;
 -              error = ravb_hook_irq(priv->rx_irqs[RAVB_BE], ravb_be_interrupt,
 -                                    ndev, dev, "ch0:rx_be");
 -              if (error)
 -                      goto out_free_irq_emac;
 -              error = ravb_hook_irq(priv->tx_irqs[RAVB_BE], ravb_be_interrupt,
 -                                    ndev, dev, "ch18:tx_be");
 -              if (error)
 -                      goto out_free_irq_be_rx;
 -              error = ravb_hook_irq(priv->rx_irqs[RAVB_NC], ravb_nc_interrupt,
 -                                    ndev, dev, "ch1:rx_nc");
 -              if (error)
 -                      goto out_free_irq_be_tx;
 -              error = ravb_hook_irq(priv->tx_irqs[RAVB_NC], ravb_nc_interrupt,
 -                                    ndev, dev, "ch19:tx_nc");
 -              if (error)
 -                      goto out_free_irq_nc_rx;
 -
 -              if (info->err_mgmt_irqs) {
 -                      error = ravb_hook_irq(priv->erra_irq, ravb_multi_interrupt,
 -                                            ndev, dev, "err_a");
 -                      if (error)
 -                              goto out_free_irq_nc_tx;
 -                      error = ravb_hook_irq(priv->mgmta_irq, ravb_multi_interrupt,
 -                                            ndev, dev, "mgmt_a");
 -                      if (error)
 -                              goto out_free_irq_erra;
 -              }
 -      }
 +      /* Set AVB config mode */
 +      error = ravb_set_config_mode(ndev);
 +      if (error)
 +              goto out_napi_off;
 +
 +      ravb_set_delay_mode(ndev);
 +      ravb_write(ndev, priv->desc_bat_dma, DBAT);
  
        /* Device init */
        error = ravb_dmac_init(ndev);
        if (error)
 -              goto out_free_irq_mgmta;
 +              goto out_set_reset;
 +
        ravb_emac_init(ndev);
  
 +      ravb_set_gti(ndev);
 +
        /* Initialise PTP Clock driver */
 -      if (info->gptp)
 +      if (info->gptp || info->ccc_gac)
                ravb_ptp_init(ndev, priv->pdev);
  
        /* PHY control start */
  
  out_ptp_stop:
        /* Stop PTP Clock driver */
 -      if (info->gptp)
 +      if (info->gptp || info->ccc_gac)
                ravb_ptp_stop(ndev);
        ravb_stop_dma(ndev);
 -out_free_irq_mgmta:
 -      if (!info->multi_irqs)
 -              goto out_free_irq;
 -      if (info->err_mgmt_irqs)
 -              free_irq(priv->mgmta_irq, ndev);
 -out_free_irq_erra:
 -      if (info->err_mgmt_irqs)
 -              free_irq(priv->erra_irq, ndev);
 -out_free_irq_nc_tx:
 -      free_irq(priv->tx_irqs[RAVB_NC], ndev);
 -out_free_irq_nc_rx:
 -      free_irq(priv->rx_irqs[RAVB_NC], ndev);
 -out_free_irq_be_tx:
 -      free_irq(priv->tx_irqs[RAVB_BE], ndev);
 -out_free_irq_be_rx:
 -      free_irq(priv->rx_irqs[RAVB_BE], ndev);
 -out_free_irq_emac:
 -      free_irq(priv->emac_irq, ndev);
 -out_free_irq:
 -      free_irq(ndev->irq, ndev);
 +out_set_reset:
 +      ravb_set_opmode(ndev, CCC_OPC_RESET);
  out_napi_off:
        if (info->nc_queues)
                napi_disable(&priv->napi[RAVB_NC]);
@@@ -2066,36 -1935,6 +2062,36 @@@ out_unlock
        rtnl_unlock();
  }
  
 +static bool ravb_can_tx_csum_gbeth(struct sk_buff *skb)
 +{
 +      struct iphdr *ip = ip_hdr(skb);
 +
 +      /* TODO: Need to add support for VLAN tag 802.1Q */
 +      if (skb_vlan_tag_present(skb))
 +              return false;
 +
 +      /* TODO: Need to add hardware checksum for IPv6 */
 +      if (skb->protocol != htons(ETH_P_IP))
 +              return false;
 +
 +      switch (ip->protocol) {
 +      case IPPROTO_TCP:
 +              break;
 +      case IPPROTO_UDP:
 +              /* If the checksum value in the UDP header field is 0, TOE does
 +               * not calculate checksum for UDP part of this frame as it is
 +               * optional function as per standards.
 +               */
 +              if (udp_hdr(skb)->check == 0)
 +                      return false;
 +              break;
 +      default:
 +              return false;
 +      }
 +
 +      return true;
 +}
 +
  /* Packet transmit function for Ethernet AVB */
  static netdev_tx_t ravb_start_xmit(struct sk_buff *skb, struct net_device *ndev)
  {
        u32 entry;
        u32 len;
  
 +      if (skb->ip_summed == CHECKSUM_PARTIAL && !ravb_can_tx_csum_gbeth(skb))
 +              skb_checksum_help(skb);
 +
        spin_lock_irqsave(&priv->lock, flags);
        if (priv->cur_tx[q] - priv->dirty_tx[q] > (priv->num_tx_ring[q] - 1) *
            num_tx_desc) {
@@@ -2322,7 -2158,7 +2318,7 @@@ static int ravb_close(struct net_devic
        ravb_write(ndev, 0, TIC);
  
        /* Stop PTP Clock driver */
 -      if (info->gptp)
 +      if (info->gptp || info->ccc_gac)
                ravb_ptp_stop(ndev);
  
        /* Set the config mode to stop the AVB-DMAC's processes */
  
        cancel_work_sync(&priv->work);
  
 -      if (info->multi_irqs) {
 -              free_irq(priv->tx_irqs[RAVB_NC], ndev);
 -              free_irq(priv->rx_irqs[RAVB_NC], ndev);
 -              free_irq(priv->tx_irqs[RAVB_BE], ndev);
 -              free_irq(priv->rx_irqs[RAVB_BE], ndev);
 -              free_irq(priv->emac_irq, ndev);
 -              if (info->err_mgmt_irqs) {
 -                      free_irq(priv->erra_irq, ndev);
 -                      free_irq(priv->mgmta_irq, ndev);
 -              }
 -      }
 -      free_irq(ndev->irq, ndev);
 -
        if (info->nc_queues)
                napi_disable(&priv->napi[RAVB_NC]);
        napi_disable(&priv->napi[RAVB_BE]);
        if (info->nc_queues)
                ravb_ring_free(ndev, RAVB_NC);
  
 -      return 0;
 +      /* Set reset mode. */
 +      return ravb_set_opmode(ndev, CCC_OPC_RESET);
  }
  
  static int ravb_hwtstamp_get(struct net_device *ndev, struct ifreq *req)
@@@ -2482,59 -2330,11 +2478,59 @@@ static void ravb_set_rx_csum(struct net
        spin_unlock_irqrestore(&priv->lock, flags);
  }
  
 +static int ravb_endisable_csum_gbeth(struct net_device *ndev, enum ravb_reg reg,
 +                                   u32 val, u32 mask)
 +{
 +      u32 csr0 = CSR0_TPE | CSR0_RPE;
 +      int ret;
 +
 +      ravb_write(ndev, csr0 & ~mask, CSR0);
 +      ret = ravb_wait(ndev, CSR0, mask, 0);
 +      if (!ret)
 +              ravb_write(ndev, val, reg);
 +
 +      ravb_write(ndev, csr0, CSR0);
 +
 +      return ret;
 +}
 +
  static int ravb_set_features_gbeth(struct net_device *ndev,
                                   netdev_features_t features)
  {
 -      /* Place holder */
 -      return 0;
 +      netdev_features_t changed = ndev->features ^ features;
 +      struct ravb_private *priv = netdev_priv(ndev);
 +      unsigned long flags;
 +      int ret = 0;
 +      u32 val;
 +
 +      spin_lock_irqsave(&priv->lock, flags);
 +      if (changed & NETIF_F_RXCSUM) {
 +              if (features & NETIF_F_RXCSUM)
 +                      val = CSR2_RIP4 | CSR2_RTCP4 | CSR2_RUDP4 | CSR2_RICMP4;
 +              else
 +                      val = 0;
 +
 +              ret = ravb_endisable_csum_gbeth(ndev, CSR2, val, CSR0_RPE);
 +              if (ret)
 +                      goto done;
 +      }
 +
 +      if (changed & NETIF_F_HW_CSUM) {
 +              if (features & NETIF_F_HW_CSUM)
 +                      val = CSR1_TIP4 | CSR1_TTCP4 | CSR1_TUDP4;
 +              else
 +                      val = 0;
 +
 +              ret = ravb_endisable_csum_gbeth(ndev, CSR1, val, CSR0_TPE);
 +              if (ret)
 +                      goto done;
 +      }
 +
 +      ndev->features = features;
 +done:
 +      spin_unlock_irqrestore(&priv->lock, flags);
 +
 +      return ret;
  }
  
  static int ravb_set_features_rcar(struct net_device *ndev,
@@@ -2714,8 -2514,6 +2710,8 @@@ static const struct ravb_hw_info gbeth_
        .emac_init = ravb_emac_init_gbeth,
        .gstrings_stats = ravb_gstrings_stats_gbeth,
        .gstrings_size = sizeof(ravb_gstrings_stats_gbeth),
 +      .net_hw_features = NETIF_F_RXCSUM | NETIF_F_HW_CSUM,
 +      .net_features = NETIF_F_RXCSUM | NETIF_F_HW_CSUM,
        .stats_len = ARRAY_SIZE(ravb_gstrings_stats_gbeth),
        .max_rx_len = ALIGN(GBETH_RX_BUFF_MAX, RAVB_ALIGN),
        .tccr_mask = TCCR_TSRQ0,
@@@ -2739,88 -2537,100 +2735,88 @@@ static const struct of_device_id ravb_m
  };
  MODULE_DEVICE_TABLE(of, ravb_match_table);
  
 -static int ravb_set_gti(struct net_device *ndev)
 +static int ravb_setup_irq(struct ravb_private *priv, const char *irq_name,
 +                        const char *ch, int *irq, irq_handler_t handler)
  {
 -      struct ravb_private *priv = netdev_priv(ndev);
 -      const struct ravb_hw_info *info = priv->info;
 -      struct device *dev = ndev->dev.parent;
 -      unsigned long rate;
 -      uint64_t inc;
 -
 -      if (info->gptp_ref_clk)
 -              rate = clk_get_rate(priv->gptp_clk);
 -      else
 -              rate = clk_get_rate(priv->clk);
 -      if (!rate)
 -              return -EINVAL;
 +      struct platform_device *pdev = priv->pdev;
 +      struct net_device *ndev = priv->ndev;
 +      struct device *dev = &pdev->dev;
 +      const char *dev_name;
 +      unsigned long flags;
 +      int error;
  
 -      inc = div64_ul(1000000000ULL << 20, rate);
 +      if (irq_name) {
 +              dev_name = devm_kasprintf(dev, GFP_KERNEL, "%s:%s", ndev->name, ch);
 +              if (!dev_name)
 +                      return -ENOMEM;
  
 -      if (inc < GTI_TIV_MIN || inc > GTI_TIV_MAX) {
 -              dev_err(dev, "gti.tiv increment 0x%llx is outside the range 0x%x - 0x%x\n",
 -                      inc, GTI_TIV_MIN, GTI_TIV_MAX);
 -              return -EINVAL;
 +              *irq = platform_get_irq_byname(pdev, irq_name);
 +              flags = 0;
 +      } else {
 +              dev_name = ndev->name;
 +              *irq = platform_get_irq(pdev, 0);
 +              flags = IRQF_SHARED;
        }
 +      if (*irq < 0)
 +              return *irq;
  
 -      ravb_write(ndev, inc, GTI);
 +      error = devm_request_irq(dev, *irq, handler, flags, dev_name, ndev);
 +      if (error)
 +              netdev_err(ndev, "cannot request IRQ %s\n", dev_name);
  
 -      return 0;
 +      return error;
  }
  
 -static int ravb_set_config_mode(struct net_device *ndev)
 +static int ravb_setup_irqs(struct ravb_private *priv)
  {
 -      struct ravb_private *priv = netdev_priv(ndev);
        const struct ravb_hw_info *info = priv->info;
 -      int error;
 +      struct net_device *ndev = priv->ndev;
 +      const char *irq_name, *emac_irq_name;
 +      int error, irq;
  
 -      if (info->gptp) {
 -              error = ravb_set_opmode(ndev, CCC_OPC_CONFIG);
 -              if (error)
 -                      return error;
 -              /* Set CSEL value */
 -              ravb_modify(ndev, CCC, CCC_CSEL, CCC_CSEL_HPB);
 -      } else if (info->ccc_gac) {
 -              error = ravb_set_opmode(ndev, CCC_OPC_CONFIG | CCC_GAC | CCC_CSEL_HPB);
 +      if (!info->multi_irqs)
 +              return ravb_setup_irq(priv, NULL, NULL, &ndev->irq, ravb_interrupt);
 +
 +      if (info->err_mgmt_irqs) {
 +              irq_name = "dia";
 +              emac_irq_name = "line3";
        } else {
 -              error = ravb_set_opmode(ndev, CCC_OPC_CONFIG);
 +              irq_name = "ch22";
 +              emac_irq_name = "ch24";
        }
  
 -      return error;
 -}
 -
 -/* Set tx and rx clock internal delay modes */
 -static void ravb_parse_delay_mode(struct device_node *np, struct net_device *ndev)
 -{
 -      struct ravb_private *priv = netdev_priv(ndev);
 -      bool explicit_delay = false;
 -      u32 delay;
 +      error = ravb_setup_irq(priv, irq_name, "ch22:multi", &ndev->irq, ravb_multi_interrupt);
 +      if (error)
 +              return error;
  
 -      if (!of_property_read_u32(np, "rx-internal-delay-ps", &delay)) {
 -              /* Valid values are 0 and 1800, according to DT bindings */
 -              priv->rxcidm = !!delay;
 -              explicit_delay = true;
 -      }
 -      if (!of_property_read_u32(np, "tx-internal-delay-ps", &delay)) {
 -              /* Valid values are 0 and 2000, according to DT bindings */
 -              priv->txcidm = !!delay;
 -              explicit_delay = true;
 -      }
 +      error = ravb_setup_irq(priv, emac_irq_name, "ch24:emac", &priv->emac_irq,
 +                             ravb_emac_interrupt);
 +      if (error)
 +              return error;
  
 -      if (explicit_delay)
 -              return;
 +      if (info->err_mgmt_irqs) {
 +              error = ravb_setup_irq(priv, "err_a", "err_a", &irq, ravb_multi_interrupt);
 +              if (error)
 +                      return error;
  
 -      /* Fall back to legacy rgmii-*id behavior */
 -      if (priv->phy_interface == PHY_INTERFACE_MODE_RGMII_ID ||
 -          priv->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID) {
 -              priv->rxcidm = 1;
 -              priv->rgmii_override = 1;
 +              error = ravb_setup_irq(priv, "mgmt_a", "mgmt_a", &irq, ravb_multi_interrupt);
 +              if (error)
 +                      return error;
        }
  
 -      if (priv->phy_interface == PHY_INTERFACE_MODE_RGMII_ID ||
 -          priv->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID) {
 -              priv->txcidm = 1;
 -              priv->rgmii_override = 1;
 -      }
 -}
 +      error = ravb_setup_irq(priv, "ch0", "ch0:rx_be", &irq, ravb_be_interrupt);
 +      if (error)
 +              return error;
  
 -static void ravb_set_delay_mode(struct net_device *ndev)
 -{
 -      struct ravb_private *priv = netdev_priv(ndev);
 -      u32 set = 0;
 +      error = ravb_setup_irq(priv, "ch1", "ch1:rx_nc", &irq, ravb_nc_interrupt);
 +      if (error)
 +              return error;
  
 -      if (priv->rxcidm)
 -              set |= APSR_RDM;
 -      if (priv->txcidm)
 -              set |= APSR_TDM;
 -      ravb_modify(ndev, APSR, APSR_RDM | APSR_TDM, set);
 +      error = ravb_setup_irq(priv, "ch18", "ch18:tx_be", &irq, ravb_be_interrupt);
 +      if (error)
 +              return error;
 +
 +      return ravb_setup_irq(priv, "ch19", "ch19:tx_nc", &irq, ravb_nc_interrupt);
  }
  
  static int ravb_probe(struct platform_device *pdev)
        struct reset_control *rstc;
        struct ravb_private *priv;
        struct net_device *ndev;
 -      int error, irq, q;
        struct resource *res;
 -      int i;
 +      int error, q;
  
        if (!np) {
                dev_err(&pdev->dev,
                return -EINVAL;
        }
  
 -      rstc = devm_reset_control_get_optional_exclusive(&pdev->dev, NULL);
 +      rstc = devm_reset_control_get_exclusive(&pdev->dev, NULL);
        if (IS_ERR(rstc))
                return dev_err_probe(&pdev->dev, PTR_ERR(rstc),
                                     "failed to get cpg reset\n");
        if (error)
                goto out_free_netdev;
  
 -      pm_runtime_enable(&pdev->dev);
 -      error = pm_runtime_resume_and_get(&pdev->dev);
 -      if (error < 0)
 -              goto out_rpm_disable;
 -
 -      if (info->multi_irqs) {
 -              if (info->err_mgmt_irqs)
 -                      irq = platform_get_irq_byname(pdev, "dia");
 -              else
 -                      irq = platform_get_irq_byname(pdev, "ch22");
 -      } else {
 -              irq = platform_get_irq(pdev, 0);
 -      }
 -      if (irq < 0) {
 -              error = irq;
 -              goto out_release;
 -      }
 -      ndev->irq = irq;
 -
        SET_NETDEV_DEV(ndev, &pdev->dev);
  
        priv = netdev_priv(ndev);
                priv->num_rx_ring[RAVB_NC] = NC_RX_RING_SIZE;
        }
  
 +      error = ravb_setup_irqs(priv);
 +      if (error)
 +              goto out_reset_assert;
 +
 +      priv->clk = devm_clk_get(&pdev->dev, NULL);
 +      if (IS_ERR(priv->clk)) {
 +              error = PTR_ERR(priv->clk);
 +              goto out_reset_assert;
 +      }
 +
 +      if (info->gptp_ref_clk) {
 +              priv->gptp_clk = devm_clk_get(&pdev->dev, "gptp");
 +              if (IS_ERR(priv->gptp_clk)) {
 +                      error = PTR_ERR(priv->gptp_clk);
 +                      goto out_reset_assert;
 +              }
 +      }
 +
 +      priv->refclk = devm_clk_get_optional(&pdev->dev, "refclk");
 +      if (IS_ERR(priv->refclk)) {
 +              error = PTR_ERR(priv->refclk);
 +              goto out_reset_assert;
 +      }
 +      clk_prepare(priv->refclk);
 +
 +      platform_set_drvdata(pdev, ndev);
 +      pm_runtime_enable(&pdev->dev);
 +      error = pm_runtime_resume_and_get(&pdev->dev);
 +      if (error < 0)
 +              goto out_rpm_disable;
 +
        priv->addr = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
        if (IS_ERR(priv->addr)) {
                error = PTR_ERR(priv->addr);
 -              goto out_release;
 +              goto out_rpm_put;
        }
  
        /* The Ether-specific entries in the device structure. */
  
        error = of_get_phy_mode(np, &priv->phy_interface);
        if (error && error != -ENODEV)
 -              goto out_release;
 +              goto out_rpm_put;
  
        priv->no_avb_link = of_property_read_bool(np, "renesas,no-ether-link");
        priv->avb_link_active_low =
                of_property_read_bool(np, "renesas,ether-link-active-low");
  
 -      if (info->multi_irqs) {
 -              if (info->err_mgmt_irqs)
 -                      irq = platform_get_irq_byname(pdev, "line3");
 -              else
 -                      irq = platform_get_irq_byname(pdev, "ch24");
 -              if (irq < 0) {
 -                      error = irq;
 -                      goto out_release;
 -              }
 -              priv->emac_irq = irq;
 -              for (i = 0; i < NUM_RX_QUEUE; i++) {
 -                      irq = platform_get_irq_byname(pdev, ravb_rx_irqs[i]);
 -                      if (irq < 0) {
 -                              error = irq;
 -                              goto out_release;
 -                      }
 -                      priv->rx_irqs[i] = irq;
 -              }
 -              for (i = 0; i < NUM_TX_QUEUE; i++) {
 -                      irq = platform_get_irq_byname(pdev, ravb_tx_irqs[i]);
 -                      if (irq < 0) {
 -                              error = irq;
 -                              goto out_release;
 -                      }
 -                      priv->tx_irqs[i] = irq;
 -              }
 -
 -              if (info->err_mgmt_irqs) {
 -                      irq = platform_get_irq_byname(pdev, "err_a");
 -                      if (irq < 0) {
 -                              error = irq;
 -                              goto out_release;
 -                      }
 -                      priv->erra_irq = irq;
 -
 -                      irq = platform_get_irq_byname(pdev, "mgmt_a");
 -                      if (irq < 0) {
 -                              error = irq;
 -                              goto out_release;
 -                      }
 -                      priv->mgmta_irq = irq;
 -              }
 -      }
 -
 -      priv->clk = devm_clk_get(&pdev->dev, NULL);
 -      if (IS_ERR(priv->clk)) {
 -              error = PTR_ERR(priv->clk);
 -              goto out_release;
 -      }
 -
 -      priv->refclk = devm_clk_get_optional(&pdev->dev, "refclk");
 -      if (IS_ERR(priv->refclk)) {
 -              error = PTR_ERR(priv->refclk);
 -              goto out_release;
 -      }
 -      clk_prepare_enable(priv->refclk);
 -
 -      if (info->gptp_ref_clk) {
 -              priv->gptp_clk = devm_clk_get(&pdev->dev, "gptp");
 -              if (IS_ERR(priv->gptp_clk)) {
 -                      error = PTR_ERR(priv->gptp_clk);
 -                      goto out_disable_refclk;
 -              }
 -              clk_prepare_enable(priv->gptp_clk);
 -      }
 -
        ndev->max_mtu = info->rx_max_buf_size - (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN);
        ndev->min_mtu = ETH_MIN_MTU;
  
        ndev->netdev_ops = &ravb_netdev_ops;
        ndev->ethtool_ops = &ravb_ethtool_ops;
  
 -      /* Set AVB config mode */
 -      error = ravb_set_config_mode(ndev);
 +      error = ravb_compute_gti(ndev);
        if (error)
 -              goto out_disable_gptp_clk;
 -
 -      if (info->gptp || info->ccc_gac) {
 -              /* Set GTI value */
 -              error = ravb_set_gti(ndev);
 -              if (error)
 -                      goto out_disable_gptp_clk;
 +              goto out_rpm_put;
  
 -              /* Request GTI loading */
 -              ravb_modify(ndev, GCCR, GCCR_LTI, GCCR_LTI);
 -      }
 -
 -      if (info->internal_delay) {
 -              ravb_parse_delay_mode(np, ndev);
 -              ravb_set_delay_mode(ndev);
 -      }
 +      ravb_parse_delay_mode(np, ndev);
  
        /* Allocate descriptor base address table */
        priv->desc_bat_size = sizeof(struct ravb_desc) * DBAT_ENTRY_NUM;
                        "Cannot allocate desc base address table (size %d bytes)\n",
                        priv->desc_bat_size);
                error = -ENOMEM;
 -              goto out_disable_gptp_clk;
 +              goto out_rpm_put;
        }
        for (q = RAVB_BE; q < DBAT_ENTRY_NUM; q++)
                priv->desc_bat[q].die_dt = DT_EOS;
 -      ravb_write(ndev, priv->desc_bat_dma, DBAT);
  
        /* Initialise HW timestamp list */
        INIT_LIST_HEAD(&priv->ts_skb_list);
  
 -      /* Initialise PTP Clock driver */
 -      if (info->ccc_gac)
 -              ravb_ptp_init(ndev, pdev);
 -
        /* Debug message level */
        priv->msg_enable = RAVB_DEF_MSG_ENABLE;
  
 +      /* Set config mode as this is needed for PHY initialization. */
 +      error = ravb_set_opmode(ndev, CCC_OPC_CONFIG);
 +      if (error)
 +              goto out_rpm_put;
 +
        /* Read and set MAC address */
        ravb_read_mac_address(np, ndev);
        if (!is_valid_ether_addr(ndev->dev_addr)) {
        error = ravb_mdio_init(priv);
        if (error) {
                dev_err(&pdev->dev, "failed to initialize MDIO\n");
 -              goto out_dma_free;
 +              goto out_reset_mode;
        }
  
 +      /* Undo previous switch to config opmode. */
 +      error = ravb_set_opmode(ndev, CCC_OPC_RESET);
 +      if (error)
 +              goto out_mdio_release;
 +
        netif_napi_add(ndev, &priv->napi[RAVB_BE], ravb_poll);
        if (info->nc_queues)
                netif_napi_add(ndev, &priv->napi[RAVB_NC], ravb_poll);
        netdev_info(ndev, "Base address at %#x, %pM, IRQ %d.\n",
                    (u32)ndev->base_addr, ndev->dev_addr, ndev->irq);
  
 -      platform_set_drvdata(pdev, ndev);
 -
        return 0;
  
  out_napi_del:
                netif_napi_del(&priv->napi[RAVB_NC]);
  
        netif_napi_del(&priv->napi[RAVB_BE]);
 +out_mdio_release:
        ravb_mdio_release(priv);
 -out_dma_free:
 +out_reset_mode:
 +      ravb_set_opmode(ndev, CCC_OPC_RESET);
        dma_free_coherent(ndev->dev.parent, priv->desc_bat_size, priv->desc_bat,
                          priv->desc_bat_dma);
 -
 -      /* Stop PTP Clock driver */
 -      if (info->ccc_gac)
 -              ravb_ptp_stop(ndev);
 -out_disable_gptp_clk:
 -      clk_disable_unprepare(priv->gptp_clk);
 -out_disable_refclk:
 -      clk_disable_unprepare(priv->refclk);
 -out_release:
 +out_rpm_put:
        pm_runtime_put(&pdev->dev);
  out_rpm_disable:
        pm_runtime_disable(&pdev->dev);
 +      clk_unprepare(priv->refclk);
 +out_reset_assert:
        reset_control_assert(rstc);
  out_free_netdev:
        free_netdev(ndev);
@@@ -3041,12 -2921,20 +3037,12 @@@ static void ravb_remove(struct platform
  
        ravb_mdio_release(priv);
  
 -      /* Stop PTP Clock driver */
 -      if (info->ccc_gac)
 -              ravb_ptp_stop(ndev);
 -
        dma_free_coherent(ndev->dev.parent, priv->desc_bat_size, priv->desc_bat,
                          priv->desc_bat_dma);
  
 -      ravb_set_opmode(ndev, CCC_OPC_RESET);
 -
 -      clk_disable_unprepare(priv->gptp_clk);
 -      clk_disable_unprepare(priv->refclk);
 -
        pm_runtime_put_sync(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
 +      clk_unprepare(priv->refclk);
        reset_control_assert(priv->rstc);
        free_netdev(ndev);
        platform_set_drvdata(pdev, NULL);
@@@ -3072,9 -2960,6 +3068,9 @@@ static int ravb_wol_setup(struct net_de
        /* Enable MagicPacket */
        ravb_modify(ndev, ECMR, ECMR_MPDE, ECMR_MPDE);
  
 +      if (priv->info->ccc_gac)
 +              ravb_ptp_stop(ndev);
 +
        return enable_irq_wake(priv->emac_irq);
  }
  
@@@ -3082,20 -2967,6 +3078,20 @@@ static int ravb_wol_restore(struct net_
  {
        struct ravb_private *priv = netdev_priv(ndev);
        const struct ravb_hw_info *info = priv->info;
 +      int error;
 +
 +      /* Set reset mode to rearm the WoL logic. */
 +      error = ravb_set_opmode(ndev, CCC_OPC_RESET);
 +      if (error)
 +              return error;
 +
 +      /* Set AVB config mode. */
 +      error = ravb_set_config_mode(ndev);
 +      if (error)
 +              return error;
 +
 +      if (priv->info->ccc_gac)
 +              ravb_ptp_init(ndev, priv->pdev);
  
        if (info->nc_queues)
                napi_enable(&priv->napi[RAVB_NC]);
        return disable_irq_wake(priv->emac_irq);
  }
  
 -static int __maybe_unused ravb_suspend(struct device *dev)
 +static int ravb_suspend(struct device *dev)
  {
        struct net_device *ndev = dev_get_drvdata(dev);
        struct ravb_private *priv = netdev_priv(ndev);
        int ret;
  
        if (!netif_running(ndev))
 -              return 0;
 +              goto reset_assert;
  
        netif_device_detach(ndev);
  
        if (priv->wol_enabled)
 -              ret = ravb_wol_setup(ndev);
 -      else
 -              ret = ravb_close(ndev);
 +              return ravb_wol_setup(ndev);
  
 -      if (priv->info->ccc_gac)
 -              ravb_ptp_stop(ndev);
 +      ret = ravb_close(ndev);
 +      if (ret)
 +              return ret;
  
 -      return ret;
 +reset_assert:
 +      return reset_control_assert(priv->rstc);
  }
  
 -static int __maybe_unused ravb_resume(struct device *dev)
 +static int ravb_resume(struct device *dev)
  {
        struct net_device *ndev = dev_get_drvdata(dev);
        struct ravb_private *priv = netdev_priv(ndev);
 -      const struct ravb_hw_info *info = priv->info;
 -      int ret = 0;
 -
 -      /* If WoL is enabled set reset mode to rearm the WoL logic */
 -      if (priv->wol_enabled) {
 -              ret = ravb_set_opmode(ndev, CCC_OPC_RESET);
 -              if (ret)
 -                      return ret;
 -      }
 -
 -      /* All register have been reset to default values.
 -       * Restore all registers which where setup at probe time and
 -       * reopen device if it was running before system suspended.
 -       */
 +      int ret;
  
 -      /* Set AVB config mode */
 -      ret = ravb_set_config_mode(ndev);
 +      ret = reset_control_deassert(priv->rstc);
        if (ret)
                return ret;
  
 -      if (info->gptp || info->ccc_gac) {
 -              /* Set GTI value */
 -              ret = ravb_set_gti(ndev);
 +      if (!netif_running(ndev))
 +              return 0;
 +
 +      /* If WoL is enabled restore the interface. */
 +      if (priv->wol_enabled) {
 +              ret = ravb_wol_restore(ndev);
                if (ret)
                        return ret;
 -
 -              /* Request GTI loading */
 -              ravb_modify(ndev, GCCR, GCCR_LTI, GCCR_LTI);
        }
  
 -      if (info->internal_delay)
 -              ravb_set_delay_mode(ndev);
 -
 -      /* Restore descriptor base address table */
 -      ravb_write(ndev, priv->desc_bat_dma, DBAT);
 +      /* Reopening the interface will restore the device to the working state. */
 +      ret = ravb_open(ndev);
 +      if (ret < 0)
 +              return ret;
  
 -      if (priv->info->ccc_gac)
 -              ravb_ptp_init(ndev, priv->pdev);
 -
 -      if (netif_running(ndev)) {
 -              if (priv->wol_enabled) {
 -                      ret = ravb_wol_restore(ndev);
 -                      if (ret)
 -                              return ret;
 -              }
 -              ret = ravb_open(ndev);
 -              if (ret < 0)
 -                      return ret;
 -              ravb_set_rx_mode(ndev);
 -              netif_device_attach(ndev);
 -      }
 +      ravb_set_rx_mode(ndev);
 +      netif_device_attach(ndev);
  
        return ret;
  }
  
 -static int __maybe_unused ravb_runtime_nop(struct device *dev)
 +static int ravb_runtime_suspend(struct device *dev)
  {
 -      /* Runtime PM callback shared between ->runtime_suspend()
 -       * and ->runtime_resume(). Simply returns success.
 -       *
 -       * This driver re-initializes all registers after
 -       * pm_runtime_get_sync() anyway so there is no need
 -       * to save and restore registers here.
 -       */
 +      struct net_device *ndev = dev_get_drvdata(dev);
 +      struct ravb_private *priv = netdev_priv(ndev);
 +
 +      clk_disable(priv->refclk);
 +
        return 0;
  }
  
 +static int ravb_runtime_resume(struct device *dev)
 +{
 +      struct net_device *ndev = dev_get_drvdata(dev);
 +      struct ravb_private *priv = netdev_priv(ndev);
 +
 +      return clk_enable(priv->refclk);
 +}
 +
  static const struct dev_pm_ops ravb_dev_pm_ops = {
 -      SET_SYSTEM_SLEEP_PM_OPS(ravb_suspend, ravb_resume)
 -      SET_RUNTIME_PM_OPS(ravb_runtime_nop, ravb_runtime_nop, NULL)
 +      SYSTEM_SLEEP_PM_OPS(ravb_suspend, ravb_resume)
 +      RUNTIME_PM_OPS(ravb_runtime_suspend, ravb_runtime_resume, NULL)
  };
  
  static struct platform_driver ravb_driver = {
        .remove_new     = ravb_remove,
        .driver = {
                .name   = "ravb",
 -              .pm     = &ravb_dev_pm_ops,
 +              .pm     = pm_ptr(&ravb_dev_pm_ops),
                .of_match_table = ravb_match_table,
        },
  };
diff --combined include/linux/netdevice.h
index c541550b0e6e6c8628b4df2b6d47815a8907f2c5,ef7bfbb9849733fa7f1f097ba53a36a68cc3384b..f07c8374f29cb936fe11236fc63e06e741b1c965
@@@ -1062,7 -1062,7 +1062,7 @@@ struct xfrmdev_ops 
        bool    (*xdo_dev_offload_ok) (struct sk_buff *skb,
                                       struct xfrm_state *x);
        void    (*xdo_dev_state_advance_esn) (struct xfrm_state *x);
 -      void    (*xdo_dev_state_update_curlft) (struct xfrm_state *x);
 +      void    (*xdo_dev_state_update_stats) (struct xfrm_state *x);
        int     (*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack);
        void    (*xdo_dev_policy_delete) (struct xfrm_policy *x);
        void    (*xdo_dev_policy_free) (struct xfrm_policy *x);
@@@ -1815,15 -1815,6 +1815,15 @@@ enum netdev_stat_type 
        NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */
  };
  
 +enum netdev_reg_state {
 +      NETREG_UNINITIALIZED = 0,
 +      NETREG_REGISTERED,      /* completed register_netdevice */
 +      NETREG_UNREGISTERING,   /* called unregister_netdevice */
 +      NETREG_UNREGISTERED,    /* completed unregister todo */
 +      NETREG_RELEASED,        /* called free_netdev */
 +      NETREG_DUMMY,           /* dummy device for NAPI poll */
 +};
 +
  /**
   *    struct net_device - The DEVICE structure.
   *
@@@ -2150,6 -2141,11 +2150,11 @@@ struct net_device 
  
        /* TXRX read-mostly hotpath */
        __cacheline_group_begin(net_device_read_txrx);
+       union {
+               struct pcpu_lstats __percpu             *lstats;
+               struct pcpu_sw_netstats __percpu        *tstats;
+               struct pcpu_dstats __percpu             *dstats;
+       };
        unsigned int            flags;
        unsigned short          hard_header_len;
        netdev_features_t       features;
        const struct tlsdev_ops *tlsdev_ops;
  #endif
  
 -      unsigned char           operstate;
 +      unsigned int            operstate;
        unsigned char           link_mode;
  
        unsigned char           if_port;
  
        struct list_head        link_watch_list;
  
 -      enum { NETREG_UNINITIALIZED=0,
 -             NETREG_REGISTERED,       /* completed register_netdevice */
 -             NETREG_UNREGISTERING,    /* called unregister_netdevice */
 -             NETREG_UNREGISTERED,     /* completed unregister todo */
 -             NETREG_RELEASED,         /* called free_netdev */
 -             NETREG_DUMMY,            /* dummy device for NAPI poll */
 -      } reg_state:8;
 +      u8 reg_state;
  
        bool dismantle;
  
        enum netdev_ml_priv_type        ml_priv_type;
  
        enum netdev_stat_type           pcpu_stat_type:8;
-       union {
-               struct pcpu_lstats __percpu             *lstats;
-               struct pcpu_sw_netstats __percpu        *tstats;
-               struct pcpu_dstats __percpu             *dstats;
-       };
  
  #if IS_ENABLED(CONFIG_GARP)
        struct garp_port __rcu  *garp_port;
@@@ -3077,6 -3074,8 +3077,6 @@@ int call_netdevice_notifiers(unsigned l
  int call_netdevice_notifiers_info(unsigned long val,
                                  struct netdev_notifier_info *info);
  
 -extern rwlock_t                               dev_base_lock;          /* Device list lock */
 -
  #define for_each_netdev(net, d)               \
                list_for_each_entry(d, &(net)->dev_base_head, dev_list)
  #define for_each_netdev_reverse(net, d)       \
@@@ -3199,7 -3198,7 +3199,7 @@@ static inline void unregister_netdevice
  int netdev_refcnt_read(const struct net_device *dev);
  void free_netdev(struct net_device *dev);
  void netdev_freemem(struct net_device *dev);
 -int init_dummy_netdev(struct net_device *dev);
 +void init_dummy_netdev(struct net_device *dev);
  
  struct net_device *netdev_get_xmit_slave(struct net_device *dev,
                                         struct sk_buff *skb,
@@@ -3959,7 -3958,7 +3959,7 @@@ static inline void dev_consume_skb_any(
  u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
                             struct bpf_prog *xdp_prog);
  void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
 -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb);
  int netif_rx(struct sk_buff *skb);
  int __netif_rx(struct sk_buff *skb);
  
@@@ -5255,9 -5254,7 +5255,9 @@@ static inline const char *netdev_name(c
  
  static inline const char *netdev_reg_state(const struct net_device *dev)
  {
 -      switch (dev->reg_state) {
 +      u8 reg_state = READ_ONCE(dev->reg_state);
 +
 +      switch (reg_state) {
        case NETREG_UNINITIALIZED: return " (uninitialized)";
        case NETREG_REGISTERED: return "";
        case NETREG_UNREGISTERING: return " (unregistering)";
        case NETREG_DUMMY: return " (dummy)";
        }
  
 -      WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, dev->reg_state);
 +      WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, reg_state);
        return " (unknown)";
  }
  
diff --combined net/core/dev.c
index d8dd293a7a279f6495b97ca99b0c1db702e6415d,73a0219730075e666c4f11f668a50dbf9f9afa97..cc9c2eda65aca62bbb1c08b936520936e51f596e
  #include <linux/prandom.h>
  #include <linux/once_lite.h>
  #include <net/netdev_rx_queue.h>
 +#include <net/page_pool/types.h>
 +#include <net/page_pool/helpers.h>
  
  #include "dev.h"
  #include "net-sysfs.h"
@@@ -168,6 -166,28 +168,6 @@@ static int call_netdevice_notifiers_ext
                                           struct net_device *dev,
                                           struct netlink_ext_ack *extack);
  
 -/*
 - * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 - * semaphore.
 - *
 - * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 - *
 - * Writers must hold the rtnl semaphore while they loop through the
 - * dev_base_head list, and hold dev_base_lock for writing when they do the
 - * actual updates.  This allows pure readers to access the list even
 - * while a writer is preparing to update it.
 - *
 - * To put it another way, dev_base_lock is held for writing only to
 - * protect against pure readers; the rtnl semaphore provides the
 - * protection against other writers.
 - *
 - * See, for example usages, register_netdevice() and
 - * unregister_netdevice(), which must be called with the rtnl
 - * semaphore held.
 - */
 -DEFINE_RWLOCK(dev_base_lock);
 -EXPORT_SYMBOL(dev_base_lock);
 -
  static DEFINE_MUTEX(ifalias_mutex);
  
  /* protects napi_hash addition/deletion and napi_gen_id */
@@@ -316,27 -336,18 +316,27 @@@ int netdev_name_node_alt_create(struct 
                return -ENOMEM;
        netdev_name_node_add(net, name_node);
        /* The node that holds dev->name acts as a head of per-device list. */
-       list_add_tail(&name_node->list, &dev->name_node->list);
+       list_add_tail_rcu(&name_node->list, &dev->name_node->list);
  
        return 0;
  }
  
 -static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
 +static void netdev_name_node_alt_free(struct rcu_head *head)
  {
 -      list_del(&name_node->list);
 +      struct netdev_name_node *name_node =
 +              container_of(head, struct netdev_name_node, rcu);
 +
        kfree(name_node->name);
        netdev_name_node_free(name_node);
  }
  
 +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
 +{
 +      netdev_name_node_del(name_node);
 +      list_del(&name_node->list);
 +      call_rcu(&name_node->rcu, netdev_name_node_alt_free);
 +}
 +
  int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  {
        struct netdev_name_node *name_node;
        if (name_node == dev->name_node || name_node->dev != dev)
                return -EINVAL;
  
 -      netdev_name_node_del(name_node);
 -      synchronize_rcu();
        __netdev_name_node_alt_destroy(name_node);
 -
        return 0;
  }
  
@@@ -359,10 -373,8 +359,10 @@@ static void netdev_name_node_alt_flush(
  {
        struct netdev_name_node *name_node, *tmp;
  
 -      list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
 -              __netdev_name_node_alt_destroy(name_node);
 +      list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
 +              list_del(&name_node->list);
 +              netdev_name_node_alt_free(&name_node->rcu);
 +      }
  }
  
  /* Device list insertion */
@@@ -373,10 -385,12 +373,10 @@@ static void list_netdevice(struct net_d
  
        ASSERT_RTNL();
  
 -      write_lock(&dev_base_lock);
        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
        netdev_name_node_add(net, dev->name_node);
        hlist_add_head_rcu(&dev->index_hlist,
                           dev_index_hash(net, dev->ifindex));
 -      write_unlock(&dev_base_lock);
  
        netdev_for_each_altname(dev, name_node)
                netdev_name_node_add(net, name_node);
  /* Device list removal
   * caller must respect a RCU grace period before freeing/reusing dev
   */
 -static void unlist_netdevice(struct net_device *dev, bool lock)
 +static void unlist_netdevice(struct net_device *dev)
  {
        struct netdev_name_node *name_node;
        struct net *net = dev_net(dev);
                netdev_name_node_del(name_node);
  
        /* Unlink dev from the device chain */
 -      if (lock)
 -              write_lock(&dev_base_lock);
        list_del_rcu(&dev->dev_list);
        netdev_name_node_del(dev->name_node);
        hlist_del_rcu(&dev->index_hlist);
 -      if (lock)
 -              write_unlock(&dev_base_lock);
  
        dev_base_seq_inc(dev_net(dev));
  }
@@@ -424,12 -442,6 +424,12 @@@ static RAW_NOTIFIER_HEAD(netdev_chain)
  DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  EXPORT_PER_CPU_SYMBOL(softnet_data);
  
 +/* Page_pool has a lockless array/stack to alloc/recycle pages.
 + * PP consumers must pay attention to run APIs in the appropriate context
 + * (e.g. NAPI context).
 + */
 +static DEFINE_PER_CPU_ALIGNED(struct page_pool *, system_page_pool);
 +
  #ifdef CONFIG_LOCKDEP
  /*
   * register_netdevice() inits txq->_xmit_lock and sets lockdep class
@@@ -726,9 -738,9 +726,9 @@@ EXPORT_SYMBOL_GPL(dev_fill_forward_path
   *    @net: the applicable net namespace
   *    @name: name to find
   *
 - *    Find an interface by name. Must be called under RTNL semaphore
 - *    or @dev_base_lock. If the name is found a pointer to the device
 - *    is returned. If the name is not found then %NULL is returned. The
 + *    Find an interface by name. Must be called under RTNL semaphore.
 + *    If the name is found a pointer to the device is returned.
 + *    If the name is not found then %NULL is returned. The
   *    reference counters are not incremented so the caller must be
   *    careful with locks.
   */
@@@ -809,7 -821,8 +809,7 @@@ EXPORT_SYMBOL(netdev_get_by_name)
   *    Search for an interface by index. Returns %NULL if the device
   *    is not found or a pointer to the device. The device has not
   *    had its reference counter increased so the caller must be careful
 - *    about locking. The caller must hold either the RTNL semaphore
 - *    or @dev_base_lock.
 + *    about locking. The caller must hold the RTNL semaphore.
   */
  
  struct net_device *__dev_get_by_index(struct net *net, int ifindex)
@@@ -1199,13 -1212,13 +1199,13 @@@ int dev_change_name(struct net_device *
                            dev->flags & IFF_UP ? " (while UP)" : "");
  
        old_assign_type = dev->name_assign_type;
 -      dev->name_assign_type = NET_NAME_RENAMED;
 +      WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
  
  rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
 -              dev->name_assign_type = old_assign_type;
 +              WRITE_ONCE(dev->name_assign_type, old_assign_type);
                up_write(&devnet_rename_sem);
                return ret;
        }
  
        netdev_adjacent_rename_links(dev, oldname);
  
 -      write_lock(&dev_base_lock);
        netdev_name_node_del(dev->name_node);
 -      write_unlock(&dev_base_lock);
  
 -      synchronize_rcu();
 +      synchronize_net();
  
 -      write_lock(&dev_base_lock);
        netdev_name_node_add(net, dev->name_node);
 -      write_unlock(&dev_base_lock);
  
        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);
                        down_write(&devnet_rename_sem);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
 -                      dev->name_assign_type = old_assign_type;
 +                      WRITE_ONCE(dev->name_assign_type, old_assign_type);
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
@@@ -4841,12 -4858,6 +4841,12 @@@ u32 bpf_prog_run_generic_xdp(struct sk_
        xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
        xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
                         skb_headlen(skb) + mac_len, true);
 +      if (skb_is_nonlinear(skb)) {
 +              skb_shinfo(skb)->xdp_frags_size = skb->data_len;
 +              xdp_buff_set_frags_flag(xdp);
 +      } else {
 +              xdp_buff_clear_frags_flag(xdp);
 +      }
  
        orig_data_end = xdp->data_end;
        orig_data = xdp->data;
                skb->len += off; /* positive on grow, negative on shrink */
        }
  
 +      /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
 +       * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
 +       */
 +      if (xdp_buff_has_frags(xdp))
 +              skb->data_len = skb_shinfo(skb)->xdp_frags_size;
 +      else
 +              skb->data_len = 0;
 +
        /* check if XDP changed eth hdr such SKB needs update */
        eth = (struct ethhdr *)xdp->data;
        if ((orig_eth_type != eth->h_proto) ||
        return act;
  }
  
 -static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 +static int
 +netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
 +{
 +      struct sk_buff *skb = *pskb;
 +      int err, hroom, troom;
 +
 +      if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
 +              return 0;
 +
 +      /* In case we have to go down the path and also linearize,
 +       * then lets do the pskb_expand_head() work just once here.
 +       */
 +      hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 +      troom = skb->tail + skb->data_len - skb->end;
 +      err = pskb_expand_head(skb,
 +                             hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 +                             troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
 +      if (err)
 +              return err;
 +
 +      return skb_linearize(skb);
 +}
 +
 +static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
                                     struct xdp_buff *xdp,
                                     struct bpf_prog *xdp_prog)
  {
 -      u32 act = XDP_DROP;
 +      struct sk_buff *skb = *pskb;
 +      u32 mac_len, act = XDP_DROP;
  
        /* Reinjected packets coming from act_mirred or similar should
         * not get XDP generic processing.
        if (skb_is_redirected(skb))
                return XDP_PASS;
  
 -      /* XDP packets must be linear and must have sufficient headroom
 -       * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
 -       * native XDP provides, thus we need to do it here as well.
 +      /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
 +       * bytes. This is the guarantee that also native XDP provides,
 +       * thus we need to do it here as well.
         */
 +      mac_len = skb->data - skb_mac_header(skb);
 +      __skb_push(skb, mac_len);
 +
        if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
            skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 -              int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 -              int troom = skb->tail + skb->data_len - skb->end;
 -
 -              /* In case we have to go down the path and also linearize,
 -               * then lets do the pskb_expand_head() work just once here.
 -               */
 -              if (pskb_expand_head(skb,
 -                                   hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 -                                   troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
 -                      goto do_drop;
 -              if (skb_linearize(skb))
 +              if (netif_skb_check_for_xdp(pskb, xdp_prog))
                        goto do_drop;
        }
  
 -      act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
 +      __skb_pull(*pskb, mac_len);
 +
 +      act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
        switch (act) {
        case XDP_REDIRECT:
        case XDP_TX:
        case XDP_PASS:
                break;
        default:
 -              bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
 +              bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
                fallthrough;
        case XDP_ABORTED:
 -              trace_xdp_exception(skb->dev, xdp_prog, act);
 +              trace_xdp_exception((*pskb)->dev, xdp_prog, act);
                fallthrough;
        case XDP_DROP:
        do_drop:
 -              kfree_skb(skb);
 +              kfree_skb(*pskb);
                break;
        }
  
@@@ -5020,24 -5004,24 +5020,24 @@@ void generic_xdp_tx(struct sk_buff *skb
  
  static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
  
 -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
  {
        if (xdp_prog) {
                struct xdp_buff xdp;
                u32 act;
                int err;
  
 -              act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 +              act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
                if (act != XDP_PASS) {
                        switch (act) {
                        case XDP_REDIRECT:
 -                              err = xdp_do_generic_redirect(skb->dev, skb,
 +                              err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
                                                              &xdp, xdp_prog);
                                if (err)
                                        goto out_redir;
                                break;
                        case XDP_TX:
 -                              generic_xdp_tx(skb, xdp_prog);
 +                              generic_xdp_tx(*pskb, xdp_prog);
                                break;
                        }
                        return XDP_DROP;
        }
        return XDP_PASS;
  out_redir:
 -      kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
 +      kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
        return XDP_DROP;
  }
  EXPORT_SYMBOL_GPL(do_xdp_generic);
@@@ -5368,8 -5352,7 +5368,8 @@@ another_round
                int ret2;
  
                migrate_disable();
 -              ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 +              ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
 +                                    &skb);
                migrate_enable();
  
                if (ret2 != XDP_PASS) {
@@@ -6194,13 -6177,8 +6194,13 @@@ static void __busy_poll_stop(struct nap
        clear_bit(NAPI_STATE_SCHED, &napi->state);
  }
  
 -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
 -                         u16 budget)
 +enum {
 +      NAPI_F_PREFER_BUSY_POLL = 1,
 +      NAPI_F_END_ON_RESCHED   = 2,
 +};
 +
 +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 +                         unsigned flags, u16 budget)
  {
        bool skip_schedule = false;
        unsigned long timeout;
  
        local_bh_disable();
  
 -      if (prefer_busy_poll) {
 +      if (flags & NAPI_F_PREFER_BUSY_POLL) {
                napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
                timeout = READ_ONCE(napi->dev->gro_flush_timeout);
                if (napi->defer_hard_irqs_count && timeout) {
        local_bh_enable();
  }
  
 -void napi_busy_loop(unsigned int napi_id,
 -                  bool (*loop_end)(void *, unsigned long),
 -                  void *loop_end_arg, bool prefer_busy_poll, u16 budget)
 +static void __napi_busy_loop(unsigned int napi_id,
 +                    bool (*loop_end)(void *, unsigned long),
 +                    void *loop_end_arg, unsigned flags, u16 budget)
  {
        unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
        int (*napi_poll)(struct napi_struct *napi, int budget);
        void *have_poll_lock = NULL;
        struct napi_struct *napi;
  
 +      WARN_ON_ONCE(!rcu_read_lock_held());
 +
  restart:
        napi_poll = NULL;
  
 -      rcu_read_lock();
 -
        napi = napi_by_id(napi_id);
        if (!napi)
 -              goto out;
 +              return;
  
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
                         */
                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
                                   NAPIF_STATE_IN_BUSY_POLL)) {
 -                              if (prefer_busy_poll)
 +                              if (flags & NAPI_F_PREFER_BUSY_POLL)
                                        set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                goto count;
                        }
                        if (cmpxchg(&napi->state, val,
                                    val | NAPIF_STATE_IN_BUSY_POLL |
                                          NAPIF_STATE_SCHED) != val) {
 -                              if (prefer_busy_poll)
 +                              if (flags & NAPI_F_PREFER_BUSY_POLL)
                                        set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                goto count;
                        }
@@@ -6303,15 -6281,12 +6303,15 @@@ count
                        break;
  
                if (unlikely(need_resched())) {
 +                      if (flags & NAPI_F_END_ON_RESCHED)
 +                              break;
                        if (napi_poll)
 -                              busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
 +                              busy_poll_stop(napi, have_poll_lock, flags, budget);
                        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                                preempt_enable();
                        rcu_read_unlock();
                        cond_resched();
 +                      rcu_read_lock();
                        if (loop_end(loop_end_arg, start_time))
                                return;
                        goto restart;
                cpu_relax();
        }
        if (napi_poll)
 -              busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
 +              busy_poll_stop(napi, have_poll_lock, flags, budget);
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
 -out:
 +}
 +
 +void napi_busy_loop_rcu(unsigned int napi_id,
 +                      bool (*loop_end)(void *, unsigned long),
 +                      void *loop_end_arg, bool prefer_busy_poll, u16 budget)
 +{
 +      unsigned flags = NAPI_F_END_ON_RESCHED;
 +
 +      if (prefer_busy_poll)
 +              flags |= NAPI_F_PREFER_BUSY_POLL;
 +
 +      __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
 +}
 +
 +void napi_busy_loop(unsigned int napi_id,
 +                  bool (*loop_end)(void *, unsigned long),
 +                  void *loop_end_arg, bool prefer_busy_poll, u16 budget)
 +{
 +      unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
 +
 +      rcu_read_lock();
 +      __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
        rcu_read_unlock();
  }
  EXPORT_SYMBOL(napi_busy_loop);
@@@ -8960,7 -8914,7 +8960,7 @@@ int dev_set_mac_address(struct net_devi
  }
  EXPORT_SYMBOL(dev_set_mac_address);
  
 -static DECLARE_RWSEM(dev_addr_sem);
 +DECLARE_RWSEM(dev_addr_sem);
  
  int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
                             struct netlink_ext_ack *extack)
@@@ -9736,11 -9690,11 +9736,11 @@@ static void dev_index_release(struct ne
  /* Delayed registration/unregisteration */
  LIST_HEAD(net_todo_list);
  DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 +atomic_t dev_unreg_count = ATOMIC_INIT(0);
  
  static void net_set_todo(struct net_device *dev)
  {
        list_add_tail(&dev->todo_list, &net_todo_list);
 -      atomic_inc(&dev_net(dev)->dev_unreg_count);
  }
  
  static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@@ -10305,9 -10259,9 +10305,9 @@@ int register_netdevice(struct net_devic
                goto err_ifindex_release;
  
        ret = netdev_register_kobject(dev);
 -      write_lock(&dev_base_lock);
 -      dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
 -      write_unlock(&dev_base_lock);
 +
 +      WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
 +
        if (ret)
                goto err_uninit_notify;
  
@@@ -10383,7 -10337,7 +10383,7 @@@ EXPORT_SYMBOL(register_netdevice)
   *    that need to tie several hardware interfaces to a single NAPI
   *    poll scheduler due to HW limitations.
   */
 -int init_dummy_netdev(struct net_device *dev)
 +void init_dummy_netdev(struct net_device *dev)
  {
        /* Clear everything. Note we don't initialize spinlocks
         * as they aren't supposed to be taken by any of the
         * because users of this 'device' dont need to change
         * its refcount.
         */
 -
 -      return 0;
  }
  EXPORT_SYMBOL_GPL(init_dummy_netdev);
  
@@@ -10565,7 -10521,6 +10565,7 @@@ void netdev_run_todo(void
  {
        struct net_device *dev, *tmp;
        struct list_head list;
 +      int cnt;
  #ifdef CONFIG_LOCKDEP
        struct list_head unlink_list;
  
                        continue;
                }
  
 -              write_lock(&dev_base_lock);
 -              dev->reg_state = NETREG_UNREGISTERED;
 -              write_unlock(&dev_base_lock);
 +              WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
                linkwatch_sync_dev(dev);
        }
  
 +      cnt = 0;
        while (!list_empty(&list)) {
                dev = netdev_wait_allrefs_any(&list);
                list_del(&dev->todo_list);
                if (dev->needs_free_netdev)
                        free_netdev(dev);
  
 -              if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
 -                      wake_up(&netdev_unregistering_wq);
 +              cnt++;
  
                /* Free network device */
                kobject_put(&dev->dev.kobj);
        }
 +      if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
 +              wake_up(&netdev_unregistering_wq);
  }
  
  /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
@@@ -11015,7 -10970,7 +11015,7 @@@ void free_netdev(struct net_device *dev
        }
  
        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
 -      dev->reg_state = NETREG_RELEASED;
 +      WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
  
        /* will free via device release */
        put_device(&dev->dev);
@@@ -11071,7 -11026,6 +11071,7 @@@ void unregister_netdevice_many_notify(s
  {
        struct net_device *dev, *tmp;
        LIST_HEAD(close_head);
 +      int cnt = 0;
  
        BUG_ON(dev_boot_phase);
        ASSERT_RTNL();
  
        list_for_each_entry(dev, head, unreg_list) {
                /* And unlink it from device chain. */
 -              write_lock(&dev_base_lock);
 -              unlist_netdevice(dev, false);
 -              dev->reg_state = NETREG_UNREGISTERING;
 -              write_unlock(&dev_base_lock);
 +              unlist_netdevice(dev);
 +              WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
        }
        flush_all_backlogs();
  
        list_for_each_entry(dev, head, unreg_list) {
                netdev_put(dev, &dev->dev_registered_tracker);
                net_set_todo(dev);
 +              cnt++;
        }
 +      atomic_add(cnt, &dev_unreg_count);
  
        list_del(head);
  }
@@@ -11286,7 -11240,7 +11286,7 @@@ int __dev_change_net_namespace(struct n
        dev_close(dev);
  
        /* And unlink it from device chain */
 -      unlist_netdevice(dev, true);
 +      unlist_netdevice(dev);
  
        synchronize_net();
  
@@@ -11622,8 -11576,11 +11622,8 @@@ static void __net_exit default_device_e
                        snprintf(fb_name, IFNAMSIZ, "dev%%d");
  
                netdev_for_each_altname_safe(dev, name_node, tmp)
 -                      if (netdev_name_in_use(&init_net, name_node->name)) {
 -                              netdev_name_node_del(name_node);
 -                              synchronize_rcu();
 +                      if (netdev_name_in_use(&init_net, name_node->name))
                                __netdev_name_node_alt_destroy(name_node);
 -                      }
  
                err = dev_change_net_namespace(dev, &init_net, fb_name);
                if (err) {
@@@ -11695,11 -11652,12 +11695,12 @@@ static void __init net_dev_struct_check
        CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
  
        /* TXRX read-mostly hotpath */
+       CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
-       CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 30);
+       CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 38);
  
        /* RX read-mostly hotpath */
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
   *
   */
  
 +/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
 +#define SYSTEM_PERCPU_PAGE_POOL_SIZE  ((1 << 20) / PAGE_SIZE)
 +
 +static int net_page_pool_create(int cpuid)
 +{
 +#if IS_ENABLED(CONFIG_PAGE_POOL)
 +      struct page_pool_params page_pool_params = {
 +              .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
 +              .nid = NUMA_NO_NODE,
 +      };
 +      struct page_pool *pp_ptr;
 +
 +      pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
 +      if (IS_ERR(pp_ptr))
 +              return -ENOMEM;
 +
 +      per_cpu(system_page_pool, cpuid) = pp_ptr;
 +#endif
 +      return 0;
 +}
 +
  /*
   *       This is called single threaded during boot, so no need
   *       to take the rtnl semaphore.
@@@ -11802,9 -11739,6 +11803,9 @@@ static int __init net_dev_init(void
                init_gro_hash(&sd->backlog);
                sd->backlog.poll = process_backlog;
                sd->backlog.weight = weight_p;
 +
 +              if (net_page_pool_create(i))
 +                      goto out;
        }
  
        dev_boot_phase = 0;
        WARN_ON(rc < 0);
        rc = 0;
  out:
 +      if (rc < 0) {
 +              for_each_possible_cpu(i) {
 +                      struct page_pool *pp_ptr;
 +
 +                      pp_ptr = per_cpu(system_page_pool, i);
 +                      if (!pp_ptr)
 +                              continue;
 +
 +                      page_pool_destroy(pp_ptr);
 +                      per_cpu(system_page_pool, i) = NULL;
 +              }
 +      }
 +
        return rc;
  }
  
diff --combined net/core/rtnetlink.c
index 39e66bf3e2384eb8e533441301fa950e77291d2c,9c4f427f3a5057b52ec05405e8b15b8ca2246b4b..c54dbe05c4c5df126d0b58403049ebc1d272907e
@@@ -483,15 -483,24 +483,15 @@@ EXPORT_SYMBOL_GPL(__rtnl_link_unregiste
   */
  static void rtnl_lock_unregistering_all(void)
  {
 -      struct net *net;
 -      bool unregistering;
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
  
        add_wait_queue(&netdev_unregistering_wq, &wait);
        for (;;) {
 -              unregistering = false;
                rtnl_lock();
                /* We held write locked pernet_ops_rwsem, and parallel
                 * setup_net() and cleanup_net() are not possible.
                 */
 -              for_each_net(net) {
 -                      if (atomic_read(&net->dev_unreg_count) > 0) {
 -                              unregistering = true;
 -                              break;
 -                      }
 -              }
 -              if (!unregistering)
 +              if (!atomic_read(&dev_unreg_count))
                        break;
                __rtnl_unlock();
  
@@@ -842,22 -851,9 +842,22 @@@ int rtnl_put_cacheinfo(struct sk_buff *
  }
  EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
  
 +void netdev_set_operstate(struct net_device *dev, int newstate)
 +{
 +      unsigned int old = READ_ONCE(dev->operstate);
 +
 +      do {
 +              if (old == newstate)
 +                      return;
 +      } while (!try_cmpxchg(&dev->operstate, &old, newstate));
 +
 +      netdev_state_change(dev);
 +}
 +EXPORT_SYMBOL(netdev_set_operstate);
 +
  static void set_operstate(struct net_device *dev, unsigned char transition)
  {
 -      unsigned char operstate = dev->operstate;
 +      unsigned char operstate = READ_ONCE(dev->operstate);
  
        switch (transition) {
        case IF_OPER_UP:
                break;
        }
  
 -      if (dev->operstate != operstate) {
 -              write_lock(&dev_base_lock);
 -              dev->operstate = operstate;
 -              write_unlock(&dev_base_lock);
 -              netdev_state_change(dev);
 -      }
 +      netdev_set_operstate(dev, operstate);
  }
  
  static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
@@@ -1019,14 -1020,17 +1019,17 @@@ static size_t rtnl_xdp_size(void
  static size_t rtnl_prop_list_size(const struct net_device *dev)
  {
        struct netdev_name_node *name_node;
-       size_t size;
+       unsigned int cnt = 0;
+       rcu_read_lock();
+       list_for_each_entry_rcu(name_node, &dev->name_node->list, list)
+               cnt++;
+       rcu_read_unlock();
  
-       if (list_empty(&dev->name_node->list))
+       if (!cnt)
                return 0;
-       size = nla_total_size(0);
-       list_for_each_entry(name_node, &dev->name_node->list, list)
-               size += nla_total_size(ALTIFNAMSIZ);
-       return size;
+       return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ);
  }
  
  static size_t rtnl_proto_down_size(const struct net_device *dev)
@@@ -2196,22 -2200,25 +2199,22 @@@ static int rtnl_valid_dump_ifinfo_req(c
  
  static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
  {
 +      const struct rtnl_link_ops *kind_ops = NULL;
        struct netlink_ext_ack *extack = cb->extack;
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
 -      struct net *tgt_net = net;
 -      int h, s_h;
 -      int idx = 0, s_idx;
 -      struct net_device *dev;
 -      struct hlist_head *head;
 +      unsigned int flags = NLM_F_MULTI;
        struct nlattr *tb[IFLA_MAX+1];
 +      struct {
 +              unsigned long ifindex;
 +      } *ctx = (void *)cb->ctx;
 +      struct net *tgt_net = net;
        u32 ext_filter_mask = 0;
 -      const struct rtnl_link_ops *kind_ops = NULL;
 -      unsigned int flags = NLM_F_MULTI;
 +      struct net_device *dev;
        int master_idx = 0;
        int netnsid = -1;
        int err, i;
  
 -      s_h = cb->args[0];
 -      s_idx = cb->args[1];
 -
        err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
        if (err < 0) {
                if (cb->strict_check)
                flags |= NLM_F_DUMP_FILTERED;
  
  walk_entries:
 -      for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
 -              idx = 0;
 -              head = &tgt_net->dev_index_head[h];
 -              hlist_for_each_entry(dev, head, index_hlist) {
 -                      if (link_dump_filtered(dev, master_idx, kind_ops))
 -                              goto cont;
 -                      if (idx < s_idx)
 -                              goto cont;
 -                      err = rtnl_fill_ifinfo(skb, dev, net,
 -                                             RTM_NEWLINK,
 -                                             NETLINK_CB(cb->skb).portid,
 -                                             nlh->nlmsg_seq, 0, flags,
 -                                             ext_filter_mask, 0, NULL, 0,
 -                                             netnsid, GFP_KERNEL);
 -
 -                      if (err < 0) {
 -                              if (likely(skb->len))
 -                                      goto out;
 -
 -                              goto out_err;
 -                      }
 -cont:
 -                      idx++;
 +      err = 0;
 +      for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
 +              if (link_dump_filtered(dev, master_idx, kind_ops))
 +                      continue;
 +              err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
 +                                     NETLINK_CB(cb->skb).portid,
 +                                     nlh->nlmsg_seq, 0, flags,
 +                                     ext_filter_mask, 0, NULL, 0,
 +                                     netnsid, GFP_KERNEL);
 +              if (err < 0) {
 +                      if (likely(skb->len))
 +                              err = skb->len;
 +                      break;
                }
        }
 -out:
 -      err = skb->len;
 -out_err:
 -      cb->args[1] = idx;
 -      cb->args[0] = h;
        cb->seq = tgt_net->dev_base_seq;
        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
        if (netnsid >= 0)
@@@ -2961,9 -2983,11 +2964,9 @@@ static int do_setlink(const struct sk_b
        if (tb[IFLA_LINKMODE]) {
                unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);
  
 -              write_lock(&dev_base_lock);
                if (dev->link_mode ^ value)
                        status |= DO_SETLINK_NOTIFY;
 -              dev->link_mode = value;
 -              write_unlock(&dev_base_lock);
 +              WRITE_ONCE(dev->link_mode, value);
        }
  
        if (tb[IFLA_VFINFO_LIST]) {
diff --combined net/ipv4/ip_gre.c
index aad5125b7a65ecc770f1b962ac5b417bd931e3ba,6b9cf5a24c19ff06634f7841141b8a30639b8d17..7b16c211b904473cc5e350aafdefb86fbf1b3693
@@@ -1025,16 -1025,14 +1025,16 @@@ static int __net_init ipgre_init_net(st
        return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
  }
  
 -static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
 +static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net,
 +                                           struct list_head *dev_to_kill)
  {
 -      ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
 +      ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops,
 +                            dev_to_kill);
  }
  
  static struct pernet_operations ipgre_net_ops = {
        .init = ipgre_init_net,
 -      .exit_batch = ipgre_exit_batch_net,
 +      .exit_batch_rtnl = ipgre_exit_batch_rtnl,
        .id   = &ipgre_net_id,
        .size = sizeof(struct ip_tunnel_net),
  };
@@@ -1699,16 -1697,14 +1699,16 @@@ static int __net_init ipgre_tap_init_ne
        return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
  }
  
 -static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
 +static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net,
 +                                               struct list_head *dev_to_kill)
  {
 -      ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
 +      ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops,
 +                            dev_to_kill);
  }
  
  static struct pernet_operations ipgre_tap_net_ops = {
        .init = ipgre_tap_init_net,
 -      .exit_batch = ipgre_tap_exit_batch_net,
 +      .exit_batch_rtnl = ipgre_tap_exit_batch_rtnl,
        .id   = &gre_tap_net_id,
        .size = sizeof(struct ip_tunnel_net),
  };
@@@ -1719,16 -1715,14 +1719,16 @@@ static int __net_init erspan_init_net(s
                                  &erspan_link_ops, "erspan0");
  }
  
 -static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
 +static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list,
 +                                            struct list_head *dev_to_kill)
  {
 -      ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
 +      ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops,
 +                            dev_to_kill);
  }
  
  static struct pernet_operations erspan_net_ops = {
        .init = erspan_init_net,
 -      .exit_batch = erspan_exit_batch_net,
 +      .exit_batch_rtnl = erspan_exit_batch_rtnl,
        .id   = &erspan_net_id,
        .size = sizeof(struct ip_tunnel_net),
  };
@@@ -1799,6 -1793,7 +1799,7 @@@ static void __exit ipgre_fini(void
  
  module_init(ipgre_init);
  module_exit(ipgre_fini);
+ MODULE_DESCRIPTION("IPv4 GRE tunnels over IP library");
  MODULE_LICENSE("GPL");
  MODULE_ALIAS_RTNL_LINK("gre");
  MODULE_ALIAS_RTNL_LINK("gretap");
diff --combined net/ipv4/ip_output.c
index 5b5a0adb927ffaf2a925094e421cc4f620d22a9c,67d846622365e8da9c2295f76943a504d16b066f..1fe794967211e249016df00dc3c2ae230d71dcff
@@@ -493,7 -493,7 +493,7 @@@ int __ip_queue_xmit(struct sock *sk, st
                                           inet->inet_dport,
                                           inet->inet_sport,
                                           sk->sk_protocol,
 -                                         RT_CONN_FLAGS_TOS(sk, tos),
 +                                         RT_TOS(tos),
                                           sk->sk_bound_dev_if);
                if (IS_ERR(rt))
                        goto no_route;
@@@ -972,8 -972,8 +972,8 @@@ static int __ip_append_data(struct soc
        unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
        int csummode = CHECKSUM_NONE;
        struct rtable *rt = (struct rtable *)cork->dst;
+       bool paged, hold_tskey, extra_uref = false;
        unsigned int wmem_alloc_delta = 0;
-       bool paged, extra_uref = false;
        u32 tskey = 0;
  
        skb = skb_peek_tail(queue);
        mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
        paged = !!cork->gso_size;
  
-       if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
-           READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
-               tskey = atomic_inc_return(&sk->sk_tskey) - 1;
        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
  
        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
  
        cork->length += length;
  
+       hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP &&
+                    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID;
+       if (hold_tskey)
+               tskey = atomic_inc_return(&sk->sk_tskey) - 1;
        /* So, what's going on in the loop below?
         *
         * We use calculated fragment length to generate chained skb,
@@@ -1274,6 -1275,8 +1275,8 @@@ error
        cork->length -= length;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
+       if (hold_tskey)
+               atomic_dec(&sk->sk_tskey);
        return err;
  }
  
diff --combined net/ipv4/ip_tunnel.c
index 9f44c49a61dee6f645932cca18dcbf786a040edd,a4513ffb66cbb74c14112bbc4c1d36d02e7f659b..756f8b923883c69d4300356dc8ad504b807e2513
@@@ -102,9 -102,10 +102,9 @@@ struct ip_tunnel *ip_tunnel_lookup(stru
                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;
  
 -              if (t->parms.link == link)
 +              if (READ_ONCE(t->parms.link) == link)
                        return t;
 -              else
 -                      cand = t;
 +              cand = t;
        }
  
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;
  
 -              if (t->parms.link == link)
 +              if (READ_ONCE(t->parms.link) == link)
                        return t;
 -              else if (!cand)
 +              if (!cand)
                        cand = t;
        }
  
                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;
  
 -              if (t->parms.link == link)
 +              if (READ_ONCE(t->parms.link) == link)
                        return t;
 -              else if (!cand)
 +              if (!cand)
                        cand = t;
        }
  
                    !(t->dev->flags & IFF_UP))
                        continue;
  
 -              if (t->parms.link == link)
 +              if (READ_ONCE(t->parms.link) == link)
                        return t;
 -              else if (!cand)
 +              if (!cand)
                        cand = t;
        }
  
@@@ -220,7 -221,7 +220,7 @@@ static struct ip_tunnel *ip_tunnel_find
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
 -                  link == t->parms.link &&
 +                  link == READ_ONCE(t->parms.link) &&
                    type == t->dev->type &&
                    ip_tunnel_key_match(&t->parms, flags, key))
                        break;
@@@ -746,7 -747,7 +746,7 @@@ void ip_tunnel_xmit(struct sk_buff *skb
  
        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
                            tunnel->parms.o_key, RT_TOS(tos),
 -                          dev_net(dev), tunnel->parms.link,
 +                          dev_net(dev), READ_ONCE(tunnel->parms.link),
                            tunnel->fwmark, skb_get_hash(skb), 0);
  
        if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
@@@ -866,7 -867,7 +866,7 @@@ static void ip_tunnel_update(struct ip_
        if (t->parms.link != p->link || t->fwmark != fwmark) {
                int mtu;
  
 -              t->parms.link = p->link;
 +              WRITE_ONCE(t->parms.link, p->link);
                t->fwmark = fwmark;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
@@@ -1056,9 -1057,9 +1056,9 @@@ EXPORT_SYMBOL(ip_tunnel_get_link_net)
  
  int ip_tunnel_get_iflink(const struct net_device *dev)
  {
 -      struct ip_tunnel *tunnel = netdev_priv(dev);
 +      const struct ip_tunnel *tunnel = netdev_priv(dev);
  
 -      return tunnel->parms.link;
 +      return READ_ONCE(tunnel->parms.link);
  }
  EXPORT_SYMBOL(ip_tunnel_get_iflink);
  
@@@ -1129,17 -1130,19 +1129,17 @@@ static void ip_tunnel_destroy(struct ne
  }
  
  void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
 -                         struct rtnl_link_ops *ops)
 +                         struct rtnl_link_ops *ops,
 +                         struct list_head *dev_to_kill)
  {
        struct ip_tunnel_net *itn;
        struct net *net;
 -      LIST_HEAD(list);
  
 -      rtnl_lock();
 +      ASSERT_RTNL();
        list_for_each_entry(net, net_list, exit_list) {
                itn = net_generic(net, id);
 -              ip_tunnel_destroy(net, itn, &list, ops);
 +              ip_tunnel_destroy(net, itn, dev_to_kill, ops);
        }
 -      unregister_netdevice_many(&list);
 -      rtnl_unlock();
  }
  EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
  
@@@ -1268,7 -1271,6 +1268,7 @@@ int ip_tunnel_init(struct net_device *d
  
        if (tunnel->collect_md)
                netif_keep_dst(dev);
 +      netdev_lockdep_set_classes(dev);
        return 0;
  }
  EXPORT_SYMBOL_GPL(ip_tunnel_init);
@@@ -1296,4 -1298,5 +1296,5 @@@ void ip_tunnel_setup(struct net_device 
  }
  EXPORT_SYMBOL_GPL(ip_tunnel_setup);
  
+ MODULE_DESCRIPTION("IPv4 tunnel implementation library");
  MODULE_LICENSE("GPL");
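
ip_tunnel_lookup() and ip_tunnel_find() run under RCU while tunnel parameters are only changed under RTNL, so the hunks above annotate every lockless read of t->parms.link with READ_ONCE() and the single writer in ip_tunnel_update() with WRITE_ONCE(). A reduced sketch of that pairing; the example_* function names are hypothetical, the field and annotations are as used above.

static void example_set_link(struct ip_tunnel *t, int link)
{
	ASSERT_RTNL();				/* writers are serialized by RTNL */
	WRITE_ONCE(t->parms.link, link);	/* lockless readers may race with this */
}

static bool example_link_matches(const struct ip_tunnel *t, int link)
{
	/* called from the RCU lookup path, no RTNL held */
	return READ_ONCE(t->parms.link) == link;
}

The annotations add no ordering; for a naturally aligned int they simply mark the data race as intentional and keep tooling such as KCSAN quiet.
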
diff --combined net/ipv4/ip_vti.c
index fb1f52d2131128a39ab5bf0482359b7b75989fb6,d1d6bb28ed6e95c6e9c247bf1df1b27287bc8328..ee587adb169f6a1c6466ff2c997ca85a2a97e8e0
@@@ -510,16 -510,14 +510,16 @@@ static int __net_init vti_init_net(stru
        return 0;
  }
  
 -static void __net_exit vti_exit_batch_net(struct list_head *list_net)
 +static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net,
 +                                         struct list_head *dev_to_kill)
  {
 -      ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops);
 +      ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops,
 +                            dev_to_kill);
  }
  
  static struct pernet_operations vti_net_ops = {
        .init = vti_init_net,
 -      .exit_batch = vti_exit_batch_net,
 +      .exit_batch_rtnl = vti_exit_batch_rtnl,
        .id   = &vti_net_id,
        .size = sizeof(struct ip_tunnel_net),
  };
@@@ -723,6 -721,7 +723,7 @@@ static void __exit vti_fini(void
  
  module_init(vti_init);
  module_exit(vti_fini);
+ MODULE_DESCRIPTION("Virtual (secure) IP tunneling library");
  MODULE_LICENSE("GPL");
  MODULE_ALIAS_RTNL_LINK("vti");
  MODULE_ALIAS_NETDEV("ip_vti0");
diff --combined net/ipv4/ipip.c
index 0151eea06cc50bec4ae64f08ca6a7161e3cbf9ae,03afa3871efc53b5af543e7d53283be69a02f818..f2696eaadbe69d4d46a2fc576ffff1a13cae8c88
@@@ -592,16 -592,14 +592,16 @@@ static int __net_init ipip_init_net(str
        return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
  }
  
 -static void __net_exit ipip_exit_batch_net(struct list_head *list_net)
 +static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net,
 +                                          struct list_head *dev_to_kill)
  {
 -      ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops);
 +      ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops,
 +                            dev_to_kill);
  }
  
  static struct pernet_operations ipip_net_ops = {
        .init = ipip_init_net,
 -      .exit_batch = ipip_exit_batch_net,
 +      .exit_batch_rtnl = ipip_exit_batch_rtnl,
        .id   = &ipip_net_id,
        .size = sizeof(struct ip_tunnel_net),
  };
@@@ -660,6 -658,7 +660,7 @@@ static void __exit ipip_fini(void
  
  module_init(ipip_init);
  module_exit(ipip_fini);
+ MODULE_DESCRIPTION("IP/IP protocol decoder library");
  MODULE_LICENSE("GPL");
  MODULE_ALIAS_RTNL_LINK("ipip");
  MODULE_ALIAS_NETDEV("tunl0");
diff --combined net/ipv6/sit.c
index b2da1f1b5fec4d784268cb04c78ddff87d1ca576,5e9f625b76e36b9a61c6c2db0b4163e78dca549a..ed3a44aa1e9d857ceb626eb5c879bd74374d2315
@@@ -1460,7 -1460,6 +1460,7 @@@ static int ipip6_tunnel_init(struct net
                return err;
        }
        netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
 +      netdev_lockdep_set_classes(dev);
        return 0;
  }
  
@@@ -1876,19 -1875,22 +1876,19 @@@ err_alloc_dev
        return err;
  }
  
 -static void __net_exit sit_exit_batch_net(struct list_head *net_list)
 +static void __net_exit sit_exit_batch_rtnl(struct list_head *net_list,
 +                                         struct list_head *dev_to_kill)
  {
 -      LIST_HEAD(list);
        struct net *net;
  
 -      rtnl_lock();
 +      ASSERT_RTNL();
        list_for_each_entry(net, net_list, exit_list)
 -              sit_destroy_tunnels(net, &list);
 -
 -      unregister_netdevice_many(&list);
 -      rtnl_unlock();
 +              sit_destroy_tunnels(net, dev_to_kill);
  }
  
  static struct pernet_operations sit_net_ops = {
        .init = sit_init_net,
 -      .exit_batch = sit_exit_batch_net,
 +      .exit_batch_rtnl = sit_exit_batch_rtnl,
        .id   = &sit_net_id,
        .size = sizeof(struct sit_net),
  };
@@@ -1954,6 -1956,7 +1954,7 @@@ xfrm_tunnel_failed
  
  module_init(sit_init);
  module_exit(sit_cleanup);
+ MODULE_DESCRIPTION("IPv6-in-IPv4 tunnel SIT driver");
  MODULE_LICENSE("GPL");
  MODULE_ALIAS_RTNL_LINK("sit");
  MODULE_ALIAS_NETDEV("sit0");
diff --combined net/mptcp/options.c
index 801a3525230d010cb831de4fe9ff8e9a0c44cbb7,e3e96a49f92296aed056137a815f0e2a30b8407c..23e317ffc9015b6660cf77b30a57780d52081af5
@@@ -689,8 -689,8 +689,8 @@@ static bool mptcp_established_options_a
        opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
        if (!echo) {
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX);
 -              opts->ahmac = add_addr_generate_hmac(msk->local_key,
 -                                                   msk->remote_key,
 +              opts->ahmac = add_addr_generate_hmac(READ_ONCE(msk->local_key),
 +                                                   READ_ONCE(msk->remote_key),
                                                     &opts->addr);
        } else {
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX);
@@@ -792,7 -792,7 +792,7 @@@ static bool mptcp_established_options_f
  
        *size = TCPOLEN_MPTCP_FASTCLOSE;
        opts->suboptions |= OPTION_MPTCP_FASTCLOSE;
 -      opts->rcvr_key = msk->remote_key;
 +      opts->rcvr_key = READ_ONCE(msk->remote_key);
  
        pr_debug("FASTCLOSE key=%llu", opts->rcvr_key);
        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX);
@@@ -962,9 -962,7 +962,7 @@@ static bool check_fully_established(str
                /* subflows are fully established as soon as we get any
                 * additional ack, including ADD_ADDR.
                 */
-               subflow->fully_established = 1;
-               WRITE_ONCE(msk->fully_established, true);
-               goto check_notify;
+               goto set_fully_established;
        }
  
        /* If the first established packet does not contain MP_CAPABLE + data
  set_fully_established:
        if (unlikely(!READ_ONCE(msk->pm.server_side)))
                pr_warn_once("bogus mpc option on established client sk");
-       mptcp_subflow_fully_established(subflow, mp_opt);
+       mptcp_data_lock((struct sock *)msk);
+       __mptcp_subflow_fully_established(msk, subflow, mp_opt);
+       mptcp_data_unlock((struct sock *)msk);
  
  check_notify:
        /* if the subflow is not already linked into the conn_list, we can't
@@@ -1030,7 -1031,7 +1031,7 @@@ u64 __mptcp_expand_seq(u64 old_seq, u6
  static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
  {
        msk->bytes_acked += new_snd_una - msk->snd_una;
 -      msk->snd_una = new_snd_una;
 +      WRITE_ONCE(msk->snd_una, new_snd_una);
  }
  
  static void ack_update_msk(struct mptcp_sock *msk,
        new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
  
        if (after64(new_wnd_end, msk->wnd_end))
 -              msk->wnd_end = new_wnd_end;
 +              WRITE_ONCE(msk->wnd_end, new_wnd_end);
  
        /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */
 -      if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)))
 +      if (after64(msk->wnd_end, snd_nxt))
                __mptcp_check_push(sk, ssk);
  
        if (after64(new_snd_una, old_snd_una)) {
  
        trace_ack_update_msk(mp_opt->data_ack,
                             old_snd_una, new_snd_una,
 -                           new_wnd_end, msk->wnd_end);
 +                           new_wnd_end, READ_ONCE(msk->wnd_end));
  }
  
  bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
@@@ -1099,8 -1100,8 +1100,8 @@@ static bool add_addr_hmac_valid(struct 
        if (mp_opt->echo)
                return true;
  
 -      hmac = add_addr_generate_hmac(msk->remote_key,
 -                                    msk->local_key,
 +      hmac = add_addr_generate_hmac(READ_ONCE(msk->remote_key),
 +                                    READ_ONCE(msk->local_key),
                                      &mp_opt->addr);
  
        pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
@@@ -1147,7 -1148,7 +1148,7 @@@ bool mptcp_incoming_options(struct soc
  
        if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) {
                if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) &&
 -                  msk->local_key == mp_opt.rcvr_key) {
 +                  READ_ONCE(msk->local_key) == mp_opt.rcvr_key) {
                        WRITE_ONCE(msk->rcv_fastclose, true);
                        mptcp_schedule_work((struct sock *)msk);
                        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX);
diff --combined net/mptcp/protocol.c
index ad39f54b3a81b6745f28cb8d0ed8907bf194ac26,8ef2927ebca297bf60d51fae91732e09562fd496..c7af62c057bc727e456ad0e57f4271c08ce67ea2
@@@ -410,7 -410,6 +410,7 @@@ static void mptcp_close_wake_up(struct 
                sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
  }
  
 +/* called under the msk socket lock */
  static bool mptcp_pending_data_fin_ack(struct sock *sk)
  {
        struct mptcp_sock *msk = mptcp_sk(sk);
@@@ -442,17 -441,16 +442,17 @@@ static void mptcp_check_data_fin_ack(st
        }
  }
  
 +/* can be called with no lock acquired */
  static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
  {
        struct mptcp_sock *msk = mptcp_sk(sk);
  
        if (READ_ONCE(msk->rcv_data_fin) &&
 -          ((1 << sk->sk_state) &
 +          ((1 << inet_sk_state_load(sk)) &
             (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
                u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);
  
 -              if (msk->ack_seq == rcv_data_fin_seq) {
 +              if (READ_ONCE(msk->ack_seq) == rcv_data_fin_seq) {
                        if (seq)
                                *seq = rcv_data_fin_seq;
  
@@@ -750,7 -748,7 +750,7 @@@ static bool __mptcp_ofo_queue(struct mp
                        __skb_queue_tail(&sk->sk_receive_queue, skb);
                }
                msk->bytes_received += end_seq - msk->ack_seq;
 -              msk->ack_seq = end_seq;
 +              WRITE_ONCE(msk->ack_seq, end_seq);
                moved = true;
        }
        return moved;
@@@ -987,7 -985,6 +987,7 @@@ static void dfrag_clear(struct sock *sk
        put_page(dfrag->page);
  }
  
 +/* called under both the msk socket lock and the data lock */
  static void __mptcp_clean_una(struct sock *sk)
  {
        struct mptcp_sock *msk = mptcp_sk(sk);
                msk->recovery = false;
  
  out:
 -      if (snd_una == READ_ONCE(msk->snd_nxt) &&
 -          snd_una == READ_ONCE(msk->write_seq)) {
 +      if (snd_una == msk->snd_nxt && snd_una == msk->write_seq) {
                if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
                        mptcp_stop_rtx_timer(sk);
        } else {
                mptcp_reset_rtx_timer(sk);
        }
 +
 +      if (mptcp_pending_data_fin_ack(sk))
 +              mptcp_schedule_work(sk);
  }
  
  static void __mptcp_clean_una_wakeup(struct sock *sk)
@@@ -1504,14 -1499,17 +1504,17 @@@ static void mptcp_update_post_push(stru
         */
        if (likely(after64(snd_nxt_new, msk->snd_nxt))) {
                msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
 -              msk->snd_nxt = snd_nxt_new;
 +              WRITE_ONCE(msk->snd_nxt, snd_nxt_new);
        }
  }
  
  void mptcp_check_and_set_pending(struct sock *sk)
  {
-       if (mptcp_send_head(sk))
-               mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
+       if (mptcp_send_head(sk)) {
+               mptcp_data_lock(sk);
+               mptcp_sk(sk)->cb_flags |= BIT(MPTCP_PUSH_PENDING);
+               mptcp_data_unlock(sk);
+       }
  }
  
  static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
@@@ -1965,6 -1963,9 +1968,9 @@@ static void mptcp_rcv_space_adjust(stru
        if (copied <= 0)
                return;
  
+       if (!msk->rcvspace_init)
+               mptcp_rcv_space_init(msk, msk->first);
        msk->rcvq_space.copied += copied;
  
        mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
@@@ -2113,7 -2114,7 +2119,7 @@@ static unsigned int mptcp_inq_hint(cons
  
        skb = skb_peek(&msk->receive_queue);
        if (skb) {
 -              u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
 +              u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq;
  
                if (hint_val >= INT_MAX)
                        return INT_MAX;
@@@ -2757,7 -2758,7 +2763,7 @@@ static void __mptcp_init_sock(struct so
        __skb_queue_head_init(&msk->receive_queue);
        msk->out_of_order_queue = RB_ROOT;
        msk->first_pending = NULL;
 -      msk->rmem_fwd_alloc = 0;
 +      WRITE_ONCE(msk->rmem_fwd_alloc, 0);
        WRITE_ONCE(msk->rmem_released, 0);
        msk->timer_ival = TCP_RTO_MIN;
        msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
@@@ -2973,7 -2974,7 +2979,7 @@@ static void __mptcp_destroy_sock(struc
  
        sk->sk_prot->destroy(sk);
  
 -      WARN_ON_ONCE(msk->rmem_fwd_alloc);
 +      WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc));
        WARN_ON_ONCE(msk->rmem_released);
        sk_stream_kill_queues(sk);
        xfrm_sk_free_policy(sk);
@@@ -3147,22 -3148,22 +3153,22 @@@ static int mptcp_disconnect(struct soc
        mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
        WRITE_ONCE(msk->flags, 0);
        msk->cb_flags = 0;
-       msk->push_pending = 0;
        msk->recovery = false;
 -      msk->can_ack = false;
 -      msk->fully_established = false;
 -      msk->rcv_data_fin = false;
 -      msk->snd_data_fin_enable = false;
 -      msk->rcv_fastclose = false;
 -      msk->use_64bit_ack = false;
 -      msk->bytes_consumed = 0;
 +      WRITE_ONCE(msk->can_ack, false);
 +      WRITE_ONCE(msk->fully_established, false);
 +      WRITE_ONCE(msk->rcv_data_fin, false);
 +      WRITE_ONCE(msk->snd_data_fin_enable, false);
 +      WRITE_ONCE(msk->rcv_fastclose, false);
 +      WRITE_ONCE(msk->use_64bit_ack, false);
        WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
        mptcp_pm_data_reset(msk);
        mptcp_ca_reset(sk);
 +      msk->bytes_consumed = 0;
        msk->bytes_acked = 0;
        msk->bytes_received = 0;
        msk->bytes_sent = 0;
        msk->bytes_retrans = 0;
+       msk->rcvspace_init = 0;
  
        WRITE_ONCE(sk->sk_shutdown, 0);
        sk_error_report(sk);
@@@ -3185,6 -3186,7 +3191,7 @@@ struct sock *mptcp_sk_clone_init(const 
  {
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
+       struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk;
  
        if (!nsk)
        __mptcp_init_sock(nsk);
  
        msk = mptcp_sk(nsk);
 -      msk->local_key = subflow_req->local_key;
 -      msk->token = subflow_req->token;
 +      WRITE_ONCE(msk->local_key, subflow_req->local_key);
 +      WRITE_ONCE(msk->token, subflow_req->token);
        msk->in_accept_queue = 1;
        WRITE_ONCE(msk->fully_established, false);
        if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD)
                WRITE_ONCE(msk->csum_enabled, true);
  
 -      msk->write_seq = subflow_req->idsn + 1;
 -      msk->snd_nxt = msk->write_seq;
 -      msk->snd_una = msk->write_seq;
 -      msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
 +      WRITE_ONCE(msk->write_seq, subflow_req->idsn + 1);
 +      WRITE_ONCE(msk->snd_nxt, msk->write_seq);
 +      WRITE_ONCE(msk->snd_una, msk->write_seq);
 +      WRITE_ONCE(msk->wnd_end, msk->snd_nxt + req->rsk_rcv_wnd);
        msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
        mptcp_init_sched(msk, mptcp_sk(sk)->sched);
  
  
        /* The msk maintain a ref to each subflow in the connections list */
        WRITE_ONCE(msk->first, ssk);
-       list_add(&mptcp_subflow_ctx(ssk)->node, &msk->conn_list);
+       subflow = mptcp_subflow_ctx(ssk);
+       list_add(&subflow->node, &msk->conn_list);
        sock_hold(ssk);
  
        /* new mpc subflow takes ownership of the newly
        __mptcp_propagate_sndbuf(nsk, ssk);
  
        mptcp_rcv_space_init(msk, ssk);
+       if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK)
+               __mptcp_subflow_fully_established(msk, subflow, mp_opt);
        bh_unlock_sock(nsk);
  
        /* note: the newly allocated socket refcount is 2 now */
@@@ -3250,6 -3256,7 +3261,7 @@@ void mptcp_rcv_space_init(struct mptcp_
  {
        const struct tcp_sock *tp = tcp_sk(ssk);
  
+       msk->rcvspace_init = 1;
        msk->rcvq_space.copied = 0;
        msk->rcvq_space.rtt_us = 0;
  
                                      TCP_INIT_CWND * tp->advmss);
        if (msk->rcvq_space.space == 0)
                msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
-       WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
  }
  
  void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
@@@ -3308,6 -3313,9 +3318,6 @@@ void __mptcp_data_acked(struct sock *sk
                __mptcp_clean_una(sk);
        else
                __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags);
 -
 -      if (mptcp_pending_data_fin_ack(sk))
 -              mptcp_schedule_work(sk);
  }
  
  void __mptcp_check_push(struct sock *sk, struct sock *ssk)
@@@ -3332,8 -3340,7 +3342,7 @@@ static void mptcp_release_cb(struct soc
        struct mptcp_sock *msk = mptcp_sk(sk);
  
        for (;;) {
-               unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) |
-                                     msk->push_pending;
+               unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED);
                struct list_head join_list;
  
                if (!flags)
                 *    datapath acquires the msk socket spinlock while holding
                 *    the subflow socket lock
                 */
-               msk->push_pending = 0;
                msk->cb_flags &= ~flags;
                spin_unlock_bh(&sk->sk_lock.slock);
  
@@@ -3477,13 -3483,8 +3485,8 @@@ void mptcp_finish_connect(struct sock *
         * accessing the field below
         */
        WRITE_ONCE(msk->local_key, subflow->local_key);
-       WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
-       WRITE_ONCE(msk->snd_nxt, msk->write_seq);
-       WRITE_ONCE(msk->snd_una, msk->write_seq);
  
        mptcp_pm_new_connection(msk, ssk, 0);
-       mptcp_rcv_space_init(msk, ssk);
  }
  
  void mptcp_sock_graft(struct sock *sk, struct socket *parent)
diff --combined net/mptcp/protocol.h
index 421dede93e2b70558722a18c9827766d16bef5c3,ed50f2015dc389d035e919a5d509e12899e22687..c5ec056040eb7cc090ba82df516dc8c24e237f1d
@@@ -260,10 -260,8 +260,10 @@@ struct mptcp_data_frag 
  struct mptcp_sock {
        /* inet_connection_sock must be the first member */
        struct inet_connection_sock sk;
 -      u64             local_key;
 -      u64             remote_key;
 +      u64             local_key;              /* protected by the first subflow socket lock
 +                                               * lockless read access
 +                                               */
 +      u64             remote_key;             /* same as above */
        u64             write_seq;
        u64             bytes_sent;
        u64             snd_nxt;
        int             rmem_released;
        unsigned long   flags;
        unsigned long   cb_flags;
-       unsigned long   push_pending;
        bool            recovery;               /* closing subflow write queue reinjected */
        bool            can_ack;
        bool            fully_established;
                        nodelay:1,
                        fastopening:1,
                        in_accept_queue:1,
-                       free_first:1;
+                       free_first:1,
+                       rcvspace_init:1;
        struct work_struct work;
        struct sk_buff  *ooo_last_skb;
        struct rb_root  out_of_order_queue;
@@@ -402,7 -400,7 +402,7 @@@ static inline struct mptcp_data_frag *m
  {
        struct mptcp_sock *msk = mptcp_sk(sk);
  
 -      if (msk->snd_una == READ_ONCE(msk->snd_nxt))
 +      if (msk->snd_una == msk->snd_nxt)
                return NULL;
  
        return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list);
@@@ -624,8 -622,9 +624,9 @@@ unsigned int mptcp_stale_loss_cnt(cons
  unsigned int mptcp_close_timeout(const struct sock *sk);
  int mptcp_get_pm_type(const struct net *net);
  const char *mptcp_get_scheduler(const struct net *net);
- void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
-                                    const struct mptcp_options_received *mp_opt);
+ void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
+                                      struct mptcp_subflow_context *subflow,
+                                      const struct mptcp_options_received *mp_opt);
  bool __mptcp_retransmit_pending_data(struct sock *sk);
  void mptcp_check_and_set_pending(struct sock *sk);
  void __mptcp_push_pending(struct sock *sk, unsigned int flags);
@@@ -954,8 -953,8 +955,8 @@@ void mptcp_event_pm_listener(const stru
                             enum mptcp_event_type event);
  bool mptcp_userspace_pm_active(const struct mptcp_sock *msk);
  
- void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
-                                  const struct mptcp_options_received *mp_opt);
+ void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
+                                    const struct mptcp_options_received *mp_opt);
  void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
                                              struct request_sock *req);
  
@@@ -1130,7 -1129,8 +1131,8 @@@ static inline bool subflow_simultaneous
  {
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
  
-       return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1) &&
+       return (1 << sk->sk_state) &
+              (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING) &&
               is_active_ssk(subflow) &&
               !subflow->conn_finished;
  }
diff --combined net/mptcp/subflow.c
index d60b83511302b48cd7e65d3291cb477a973bf593,c34ecadee1200a4804ea732df0b3d8a7b4f6e174..02dab0669cfcf38183f24ee51dd2a01ed6f3a3c6
@@@ -75,8 -75,7 +75,8 @@@ static void subflow_req_create_thmac(st
  
        get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
  
 -      subflow_generate_hmac(msk->local_key, msk->remote_key,
 +      subflow_generate_hmac(READ_ONCE(msk->local_key),
 +                            READ_ONCE(msk->remote_key),
                              subflow_req->local_nonce,
                              subflow_req->remote_nonce, hmac);
  
@@@ -422,29 -421,26 +422,26 @@@ static bool subflow_use_different_dport
  
  void __mptcp_sync_state(struct sock *sk, int state)
  {
+       struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk = mptcp_sk(sk);
+       struct sock *ssk = msk->first;
+       subflow = mptcp_subflow_ctx(ssk);
+       __mptcp_propagate_sndbuf(sk, ssk);
+       if (!msk->rcvspace_init)
+               mptcp_rcv_space_init(msk, ssk);
  
-       __mptcp_propagate_sndbuf(sk, msk->first);
        if (sk->sk_state == TCP_SYN_SENT) {
+               /* subflow->idsn is always available in TCP_SYN_SENT state,
+                * even for the FASTOPEN scenarios
+                */
+               WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
+               WRITE_ONCE(msk->snd_nxt, msk->write_seq);
                mptcp_set_state(sk, state);
                sk->sk_state_change(sk);
        }
  }
  
- static void mptcp_propagate_state(struct sock *sk, struct sock *ssk)
- {
-       struct mptcp_sock *msk = mptcp_sk(sk);
-       mptcp_data_lock(sk);
-       if (!sock_owned_by_user(sk)) {
-               __mptcp_sync_state(sk, ssk->sk_state);
-       } else {
-               msk->pending_state = ssk->sk_state;
-               __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags);
-       }
-       mptcp_data_unlock(sk);
- }
  static void subflow_set_remote_key(struct mptcp_sock *msk,
                                   struct mptcp_subflow_context *subflow,
                                   const struct mptcp_options_received *mp_opt)
        atomic64_set(&msk->rcv_wnd_sent, subflow->iasn);
  }
  
+ static void mptcp_propagate_state(struct sock *sk, struct sock *ssk,
+                                 struct mptcp_subflow_context *subflow,
+                                 const struct mptcp_options_received *mp_opt)
+ {
+       struct mptcp_sock *msk = mptcp_sk(sk);
+       mptcp_data_lock(sk);
+       if (mp_opt) {
+               /* Options are available only in the non fallback cases
+                * avoid updating rx path fields otherwise
+                */
+               WRITE_ONCE(msk->snd_una, subflow->idsn + 1);
+               WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd);
+               subflow_set_remote_key(msk, subflow, mp_opt);
+       }
+       if (!sock_owned_by_user(sk)) {
+               __mptcp_sync_state(sk, ssk->sk_state);
+       } else {
+               msk->pending_state = ssk->sk_state;
+               __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags);
+       }
+       mptcp_data_unlock(sk);
+ }
  static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
  {
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
                if (mp_opt.deny_join_id0)
                        WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
                subflow->mp_capable = 1;
-               subflow_set_remote_key(msk, subflow, &mp_opt);
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
                mptcp_finish_connect(sk);
-               mptcp_propagate_state(parent, sk);
+               mptcp_propagate_state(parent, sk, subflow, &mp_opt);
        } else if (subflow->request_join) {
                u8 hmac[SHA256_DIGEST_SIZE];
  
                }
        } else if (mptcp_check_fallback(sk)) {
  fallback:
-               mptcp_rcv_space_init(msk, sk);
-               mptcp_propagate_state(parent, sk);
+               mptcp_propagate_state(parent, sk, subflow, NULL);
        }
        return;
  
@@@ -695,8 -714,7 +715,8 @@@ static bool subflow_hmac_valid(const st
        if (!msk)
                return false;
  
 -      subflow_generate_hmac(msk->remote_key, msk->local_key,
 +      subflow_generate_hmac(READ_ONCE(msk->remote_key),
 +                            READ_ONCE(msk->local_key),
                              subflow_req->remote_nonce,
                              subflow_req->local_nonce, hmac);
  
@@@ -733,17 -751,16 +753,16 @@@ void mptcp_subflow_drop_ctx(struct soc
        kfree_rcu(ctx, rcu);
  }
  
- void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
-                                    const struct mptcp_options_received *mp_opt)
+ void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
+                                      struct mptcp_subflow_context *subflow,
+                                      const struct mptcp_options_received *mp_opt)
  {
-       struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        subflow_set_remote_key(msk, subflow, mp_opt);
        subflow->fully_established = 1;
        WRITE_ONCE(msk->fully_established, true);
  
        if (subflow->is_mptfo)
-               mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt);
+               __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt);
  }
  
  static struct sock *subflow_syn_recv_sock(const struct sock *sk,
@@@ -836,7 -853,6 +855,6 @@@ create_child
                         * mpc option
                         */
                        if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) {
-                               mptcp_subflow_fully_established(ctx, &mp_opt);
                                mptcp_pm_fully_established(owner, child);
                                ctx->pm_notified = 1;
                        }
@@@ -1532,8 -1548,8 +1550,8 @@@ int __mptcp_subflow_connect(struct soc
        mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
                                             &flags, &ifindex);
        subflow->remote_key_valid = 1;
 -      subflow->remote_key = msk->remote_key;
 -      subflow->local_key = msk->local_key;
 +      subflow->remote_key = READ_ONCE(msk->remote_key);
 +      subflow->local_key = READ_ONCE(msk->local_key);
        subflow->token = msk->token;
        mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
  
@@@ -1746,10 -1762,9 +1764,9 @@@ static void subflow_state_change(struc
        msk = mptcp_sk(parent);
        if (subflow_simultaneous_connect(sk)) {
                mptcp_do_fallback(sk);
-               mptcp_rcv_space_init(msk, sk);
                pr_fallback(msk);
                subflow->conn_finished = 1;
-               mptcp_propagate_state(parent, sk);
+               mptcp_propagate_state(parent, sk, subflow, NULL);
        }
  
        /* as recvmsg() does not acquire the subflow socket for ssk selection
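
mptcp_propagate_state() above is another instance of the MPTCP "apply now or defer to release_cb" pattern: take the msk data lock, perform the update directly when the socket is not owned by user context, otherwise stash it and set a cb_flags bit so mptcp_release_cb() replays it once the owner drops the lock. Condensed to its skeleton below; example_propagate() is a hypothetical wrapper, while the locking helpers, pending_state and MPTCP_SYNC_STATE are taken from the hunk.

static void example_propagate(struct sock *sk, int new_state)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_data_lock(sk);
	if (!sock_owned_by_user(sk)) {
		/* nobody owns the msk: apply the state change right away */
		__mptcp_sync_state(sk, new_state);
	} else {
		/* owned by user context: record the work for release_cb */
		msk->pending_state = new_state;
		__set_bit(MPTCP_SYNC_STATE, &msk->cb_flags);
	}
	mptcp_data_unlock(sk);
}
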
diff --combined net/sched/act_mirred.c
index 93a96e9d8d900c238c9a84d75201b6edf01ba198,0a1a9e40f237012ecaa561bd563162bbc1802f9b..6f4bb1c8ce7bdbf465da63214b780265fd0e8ea9
@@@ -533,8 -533,6 +533,6 @@@ static int mirred_device_event(struct n
                                 * net_device are already rcu protected.
                                 */
                                RCU_INIT_POINTER(m->tcfm_dev, NULL);
-                       } else if (m->tcfm_blockid) {
-                               m->tcfm_blockid = 0;
                        }
                        spin_unlock_bh(&m->tcf_lock);
                }
@@@ -643,7 -641,6 +641,7 @@@ static struct tc_action_ops act_mirred_
        .size           =       sizeof(struct tcf_mirred),
        .get_dev        =       tcf_mirred_get_dev,
  };
 +MODULE_ALIAS_NET_ACT("mirred");
  
  static __net_init int mirred_init_net(struct net *net)
  {
diff --combined net/smc/af_smc.c
index 66763c74ab7679be38bf486c70514620c9c1a9cc,0f53a5c6fd9d9c88c78f51640b179bf214e78bda..4b52b3b159c0ec618506988810d56aa71278da9d
@@@ -924,6 -924,7 +924,7 @@@ static int smc_switch_to_fallback(struc
                smc->clcsock->file->private_data = smc->clcsock;
                smc->clcsock->wq.fasync_list =
                        smc->sk.sk_socket->wq.fasync_list;
+               smc->sk.sk_socket->wq.fasync_list = NULL;
  
                /* There might be some wait entries remaining
                 * in smc sk->sk_wq and they should be woken up
@@@ -1045,7 -1046,7 +1046,7 @@@ static int smc_find_ism_v2_device_clnt(
        int rc = SMC_CLC_DECL_NOSMCDDEV;
        struct smcd_dev *smcd;
        int i = 1, entry = 1;
 -      bool is_virtual;
 +      bool is_emulated;
        u16 chid;
  
        if (smcd_indicated(ini->smc_type_v1))
                chid = smc_ism_get_chid(smcd);
                if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
                        continue;
 -              is_virtual = __smc_ism_is_virtual(chid);
 +              is_emulated = __smc_ism_is_emulated(chid);
                if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
                    smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
 -                      if (is_virtual && entry == SMCD_CLC_MAX_V2_GID_ENTRIES)
 +                      if (is_emulated && entry == SMCD_CLC_MAX_V2_GID_ENTRIES)
                                /* It's the last GID-CHID entry left in CLC
 -                               * Proposal SMC-Dv2 extension, but a virtual
 +                               * Proposal SMC-Dv2 extension, but an Emulated-
                                 * ISM device will take two entries. So give
                                 * it up and try the next potential ISM device.
                                 */
                        ini->is_smcd = true;
                        rc = 0;
                        i++;
 -                      entry = is_virtual ? entry + 2 : entry + 1;
 +                      entry = is_emulated ? entry + 2 : entry + 1;
                        if (entry > SMCD_CLC_MAX_V2_GID_ENTRIES)
                                break;
                }
@@@ -1413,10 -1414,10 +1414,10 @@@ static int smc_connect_ism(struct smc_s
                if (rc)
                        return rc;
  
 -              if (__smc_ism_is_virtual(ini->ism_chid[ini->ism_selected]))
 +              if (__smc_ism_is_emulated(ini->ism_chid[ini->ism_selected]))
                        ini->ism_peer_gid[ini->ism_selected].gid_ext =
                                                ntohll(aclc->d1.gid_ext);
 -              /* for non-virtual ISM devices, peer gid_ext remains 0. */
 +              /* for non-Emulated-ISM devices, peer gid_ext remains 0. */
        }
        ini->ism_peer_gid[ini->ism_selected].gid = ntohll(aclc->d0.gid);
  
@@@ -2117,10 -2118,10 +2118,10 @@@ static void smc_check_ism_v2_match(stru
                if (smc_ism_get_chid(smcd) == proposed_chid &&
                    !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
                        ini->ism_peer_gid[*matches].gid = proposed_gid->gid;
 -                      if (__smc_ism_is_virtual(proposed_chid))
 +                      if (__smc_ism_is_emulated(proposed_chid))
                                ini->ism_peer_gid[*matches].gid_ext =
                                                        proposed_gid->gid_ext;
 -                              /* non-virtual ISM's peer gid_ext remains 0. */
 +                              /* non-Emulated-ISM's peer gid_ext remains 0. */
                        ini->ism_dev[*matches] = smcd;
                        (*matches)++;
                        break;
@@@ -2170,10 -2171,10 +2171,10 @@@ static void smc_find_ism_v2_device_serv
                smcd_gid.gid = ntohll(smcd_v2_ext->gidchid[i].gid);
                smcd_gid.gid_ext = 0;
                chid = ntohs(smcd_v2_ext->gidchid[i].chid);
 -              if (__smc_ism_is_virtual(chid)) {
 +              if (__smc_ism_is_emulated(chid)) {
                        if ((i + 1) == smc_v2_ext->hdr.ism_gid_cnt ||
                            chid != ntohs(smcd_v2_ext->gidchid[i + 1].chid))
 -                              /* each virtual ISM device takes two GID-CHID
 +                              /* each Emulated-ISM device takes two GID-CHID
                                 * entries and CHID of the second entry repeats
                                 * that of the first entry.
                                 *
diff --combined net/unix/garbage.c
index 3e4b986de94b95bf3b3aac315c1674191ecb963c,2ff7ddbaa782e341e1614a4d5bd295c87664e7dd..51acf795f096016d132cdeb067214e28d6326614
  #include <net/scm.h>
  #include <net/tcp_states.h>
  
 -#include "scm.h"
 +struct unix_sock *unix_get_socket(struct file *filp)
 +{
 +      struct inode *inode = file_inode(filp);
 +
 +      /* Socket ? */
 +      if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
 +              struct socket *sock = SOCKET_I(inode);
 +              const struct proto_ops *ops;
 +              struct sock *sk = sock->sk;
 +
 +              ops = READ_ONCE(sock->ops);
  
 -/* Internal data structures and random procedures: */
 +              /* PF_UNIX ? */
 +              if (sk && ops && ops->family == PF_UNIX)
 +                      return unix_sk(sk);
 +      }
 +
 +      return NULL;
 +}
  
 +DEFINE_SPINLOCK(unix_gc_lock);
 +unsigned int unix_tot_inflight;
  static LIST_HEAD(gc_candidates);
 -static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
 +static LIST_HEAD(gc_inflight_list);
 +
 +/* Keep the number of times in flight count for the file
 + * descriptor if it is for an AF_UNIX socket.
 + */
 +void unix_inflight(struct user_struct *user, struct file *filp)
 +{
 +      struct unix_sock *u = unix_get_socket(filp);
 +
 +      spin_lock(&unix_gc_lock);
 +
 +      if (u) {
 +              if (!u->inflight) {
 +                      WARN_ON_ONCE(!list_empty(&u->link));
 +                      list_add_tail(&u->link, &gc_inflight_list);
 +              } else {
 +                      WARN_ON_ONCE(list_empty(&u->link));
 +              }
 +              u->inflight++;
 +
 +              /* Paired with READ_ONCE() in wait_for_unix_gc() */
 +              WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
 +      }
 +
 +      WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
 +
 +      spin_unlock(&unix_gc_lock);
 +}
 +
 +void unix_notinflight(struct user_struct *user, struct file *filp)
 +{
 +      struct unix_sock *u = unix_get_socket(filp);
 +
 +      spin_lock(&unix_gc_lock);
 +
 +      if (u) {
 +              WARN_ON_ONCE(!u->inflight);
 +              WARN_ON_ONCE(list_empty(&u->link));
 +
 +              u->inflight--;
 +              if (!u->inflight)
 +                      list_del_init(&u->link);
 +
 +              /* Paired with READ_ONCE() in wait_for_unix_gc() */
 +              WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
 +      }
 +
 +      WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
 +
 +      spin_unlock(&unix_gc_lock);
 +}
  
  static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
                          struct sk_buff_head *hitlist)
  
                        while (nfd--) {
                                /* Get the socket the fd matches if it indeed does so */
 -                              struct sock *sk = unix_get_socket(*fp++);
 -
 -                              if (sk) {
 -                                      struct unix_sock *u = unix_sk(sk);
 +                              struct unix_sock *u = unix_get_socket(*fp++);
  
 -                                      /* Ignore non-candidates, they could
 -                                       * have been added to the queues after
 -                                       * starting the garbage collection
 -                                       */
 -                                      if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
 -                                              hit = true;
 +                              /* Ignore non-candidates, they could have been added
 +                               * to the queues after starting the garbage collection
 +                               */
 +                              if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
 +                                      hit = true;
  
 -                                              func(u);
 -                                      }
 +                                      func(u);
                                }
                        }
                        if (hit && hitlist != NULL) {
@@@ -214,7 -151,7 +214,7 @@@ static void scan_children(struct sock *
                        /* An embryo cannot be in-flight, so it's safe
                         * to use the list link.
                         */
 -                      BUG_ON(!list_empty(&u->link));
 +                      WARN_ON_ONCE(!list_empty(&u->link));
                        list_add_tail(&u->link, &embryos);
                }
                spin_unlock(&x->sk_receive_queue.lock);
  
  static void dec_inflight(struct unix_sock *usk)
  {
 -      atomic_long_dec(&usk->inflight);
 +      usk->inflight--;
  }
  
  static void inc_inflight(struct unix_sock *usk)
  {
 -      atomic_long_inc(&usk->inflight);
 +      usk->inflight++;
  }
  
  static void inc_inflight_move_tail(struct unix_sock *u)
  {
 -      atomic_long_inc(&u->inflight);
 +      u->inflight++;
 +
        /* If this still might be part of a cycle, move it to the end
         * of the list, so that it's checked even if it was already
         * passed over
  }
  
  static bool gc_in_progress;
 -#define UNIX_INFLIGHT_TRIGGER_GC 16000
 -
 -void wait_for_unix_gc(void)
 -{
 -      /* If number of inflight sockets is insane,
 -       * force a garbage collect right now.
 -       * Paired with the WRITE_ONCE() in unix_inflight(),
 -       * unix_notinflight() and gc_in_progress().
 -       */
 -      if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
 -          !READ_ONCE(gc_in_progress))
 -              unix_gc();
 -      wait_event(unix_gc_wait, gc_in_progress == false);
 -}
  
 -/* The external entry point: unix_gc() */
 -void unix_gc(void)
 +static void __unix_gc(struct work_struct *work)
  {
 -      struct sk_buff *next_skb, *skb;
 -      struct unix_sock *u;
 -      struct unix_sock *next;
        struct sk_buff_head hitlist;
 -      struct list_head cursor;
 +      struct unix_sock *u, *next;
        LIST_HEAD(not_cycle_list);
 +      struct list_head cursor;
  
        spin_lock(&unix_gc_lock);
  
 -      /* Avoid a recursive GC. */
 -      if (gc_in_progress)
 -              goto out;
 -
 -      /* Paired with READ_ONCE() in wait_for_unix_gc(). */
 -      WRITE_ONCE(gc_in_progress, true);
 -
        /* First, select candidates for garbage collection.  Only
         * in-flight sockets are considered, and from those only ones
         * which don't have any external reference.
         */
        list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
                long total_refs;
 -              long inflight_refs;
  
                total_refs = file_count(u->sk.sk_socket->file);
 -              inflight_refs = atomic_long_read(&u->inflight);
  
 -              BUG_ON(inflight_refs < 1);
 -              BUG_ON(total_refs < inflight_refs);
 -              if (total_refs == inflight_refs) {
 +              WARN_ON_ONCE(!u->inflight);
 +              WARN_ON_ONCE(total_refs < u->inflight);
 +              if (total_refs == u->inflight) {
                        list_move_tail(&u->link, &gc_candidates);
                        __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
                        __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
                /* Move cursor to after the current position. */
                list_move(&cursor, &u->link);
  
 -              if (atomic_long_read(&u->inflight) > 0) {
 +              if (u->inflight) {
                        list_move_tail(&u->link, &not_cycle_list);
                        __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
                        scan_children(&u->sk, inc_inflight_move_tail, NULL);
  
        spin_unlock(&unix_gc_lock);
  
 -      /* We need io_uring to clean its registered files, ignore all io_uring
 -       * originated skbs. It's fine as io_uring doesn't keep references to
 -       * other io_uring instances and so killing all other files in the cycle
 -       * will put all io_uring references forcing it to go through normal
 -       * release.path eventually putting registered files.
 -       */
 -      skb_queue_walk_safe(&hitlist, skb, next_skb) {
 -              if (skb->destructor == io_uring_destruct_scm) {
 -                      __skb_unlink(skb, &hitlist);
 -                      skb_queue_tail(&skb->sk->sk_receive_queue, skb);
 -              }
 -      }
 -
        /* Here we are. Hitlist is filled. Die. */
        __skb_queue_purge(&hitlist);
  
  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
-       list_for_each_entry_safe(u, next, &gc_candidates, link) {
-               struct sk_buff *skb = u->oob_skb;
+       while (!list_empty(&gc_candidates)) {
+               u = list_entry(gc_candidates.next, struct unix_sock, link);
+               if (u->oob_skb) {
+                       struct sk_buff *skb = u->oob_skb;
  
-               if (skb) {
                        u->oob_skb = NULL;
                        kfree_skb(skb);
                }
  
        spin_lock(&unix_gc_lock);
  
 -      /* There could be io_uring registered files, just push them back to
 -       * the inflight list
 -       */
 -      list_for_each_entry_safe(u, next, &gc_candidates, link)
 -              list_move_tail(&u->link, &gc_inflight_list);
 -
        /* All candidates should have been detached by now. */
 -      BUG_ON(!list_empty(&gc_candidates));
 +      WARN_ON_ONCE(!list_empty(&gc_candidates));
  
        /* Paired with READ_ONCE() in wait_for_unix_gc(). */
        WRITE_ONCE(gc_in_progress, false);
  
 -      wake_up(&unix_gc_wait);
 -
 - out:
        spin_unlock(&unix_gc_lock);
  }
 +
 +static DECLARE_WORK(unix_gc_work, __unix_gc);
 +
 +void unix_gc(void)
 +{
 +      WRITE_ONCE(gc_in_progress, true);
 +      queue_work(system_unbound_wq, &unix_gc_work);
 +}
 +
 +#define UNIX_INFLIGHT_TRIGGER_GC 16000
 +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)
 +
 +void wait_for_unix_gc(struct scm_fp_list *fpl)
 +{
 +      /* If number of inflight sockets is insane,
 +       * force a garbage collect right now.
 +       *
 +       * Paired with the WRITE_ONCE() in unix_inflight(),
 +       * unix_notinflight(), and __unix_gc().
 +       */
 +      if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
 +          !READ_ONCE(gc_in_progress))
 +              unix_gc();
 +
 +      /* Penalise users who want to send AF_UNIX sockets
 +       * but whose sockets have not been received yet.
 +       */
 +      if (!fpl || !fpl->count_unix ||
 +          READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
 +              return;
 +
 +      if (READ_ONCE(gc_in_progress))
 +              flush_work(&unix_gc_work);
 +}
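
With this rework the af_unix GC no longer runs synchronously in the sender's context: unix_gc() only marks the collection as in progress and queues a work item on system_unbound_wq, and wait_for_unix_gc() throttles a user with too many sockets in flight by flushing that work. A generic skeleton of the hand-off, assuming kernel context; the example_* names and the cleanup payload are illustrative, while DECLARE_WORK(), queue_work(system_unbound_wq, ...) and flush_work() are the calls the patch uses.

static bool example_cleanup_running;

static void example_cleanup(struct work_struct *work)
{
	/* heavy, sleepable scan goes here */

	/* Paired with READ_ONCE() in example_throttle() */
	WRITE_ONCE(example_cleanup_running, false);
}

static DECLARE_WORK(example_cleanup_work, example_cleanup);

static void example_kick_cleanup(void)
{
	WRITE_ONCE(example_cleanup_running, true);
	queue_work(system_unbound_wq, &example_cleanup_work);
}

static void example_throttle(void)
{
	/* a misbehaving producer waits for the queued pass to finish */
	if (READ_ONCE(example_cleanup_running))
		flush_work(&example_cleanup_work);
}

flush_work() waits only for the already-queued pass, so a throttled sender blocks for at most one collection cycle.
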
diff --combined net/xfrm/xfrm_user.c
index dc4f9b8d7cb0fbb4ec3baf23b015c4bf4c916e31,f037be190baeacf8a7fc4c26240dd224a39cb984..a5232dcfea46b594579a6e52346ccb608c7e01b7
@@@ -902,7 -902,7 +902,7 @@@ static void copy_to_user_state(struct x
        memcpy(&p->sel, &x->sel, sizeof(p->sel));
        memcpy(&p->lft, &x->lft, sizeof(p->lft));
        if (x->xso.dev)
 -              xfrm_dev_state_update_curlft(x);
 +              xfrm_dev_state_update_stats(x);
        memcpy(&p->curlft, &x->curlft, sizeof(p->curlft));
        put_unaligned(x->stats.replay_window, &p->stats.replay_window);
        put_unaligned(x->stats.replay, &p->stats.replay);
@@@ -3888,5 -3888,6 +3888,6 @@@ static void __exit xfrm_user_exit(void
  
  module_init(xfrm_user_init);
  module_exit(xfrm_user_exit);
+ MODULE_DESCRIPTION("XFRM User interface");
  MODULE_LICENSE("GPL");
  MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);
diff --combined tools/testing/selftests/net/openvswitch/openvswitch.sh
index 87b80bee6df4074d63f537db06478f92ff81b3bc,36e40256ab92a696de62339dd7c7342df3468372..5cae5354384914928dc0f5568cb4ed3948862ec5
@@@ -17,7 -17,6 +17,7 @@@ tests=
        ct_connect_v4                           ip4-ct-xon: Basic ipv4 tcp connection using ct
        connect_v4                              ip4-xon: Basic ipv4 ping between two NS
        nat_connect_v4                          ip4-nat-xon: Basic ipv4 tcp connection via NAT
 +      nat_related_v4                          ip4-nat-related: ICMP related matches work with SNAT
        netlink_checks                          ovsnl: validate netlink attrs and settings
        upcall_interfaces                       ovs: test the upcall interfaces
        drop_reason                             drop: test drop reasons are emitted"
@@@ -474,67 -473,6 +474,67 @@@ test_nat_connect_v4 () 
        return 0
  }
  
 +# nat_related_v4 test
 +#  - client->server ip packets go via SNAT
 +#  - client solicits ICMP destination unreachable packet from server
 +#  - undo NAT for ICMP reply and test dst ip has been updated
 +test_nat_related_v4 () {
 +      which nc >/dev/null 2>/dev/null || return $ksft_skip
 +
 +      sbx_add "test_nat_related_v4" || return $?
 +
 +      ovs_add_dp "test_nat_related_v4" natrelated4 || return 1
 +      info "create namespaces"
 +      for ns in client server; do
 +              ovs_add_netns_and_veths "test_nat_related_v4" "natrelated4" "$ns" \
 +                      "${ns:0:1}0" "${ns:0:1}1" || return 1
 +      done
 +
 +      ip netns exec client ip addr add 172.31.110.10/24 dev c1
 +      ip netns exec client ip link set c1 up
 +      ip netns exec server ip addr add 172.31.110.20/24 dev s1
 +      ip netns exec server ip link set s1 up
 +
 +      ip netns exec server ip route add 192.168.0.20/32 via 172.31.110.10
 +
 +      # Allow ARP
 +      ovs_add_flow "test_nat_related_v4" natrelated4 \
 +              "in_port(1),eth(),eth_type(0x0806),arp()" "2" || return 1
 +      ovs_add_flow "test_nat_related_v4" natrelated4 \
 +              "in_port(2),eth(),eth_type(0x0806),arp()" "1" || return 1
 +
 +      # Allow IP traffic from client->server, rewrite source IP with SNAT to 192.168.0.20
 +      ovs_add_flow "test_nat_related_v4" natrelated4 \
 +              "ct_state(-trk),in_port(1),eth(),eth_type(0x0800),ipv4(dst=172.31.110.20)" \
 +              "ct(commit,nat(src=192.168.0.20)),recirc(0x1)" || return 1
 +      ovs_add_flow "test_nat_related_v4" natrelated4 \
 +              "recirc_id(0x1),ct_state(+trk-inv),in_port(1),eth(),eth_type(0x0800),ipv4()" \
 +              "2" || return 1
 +
 +      # Allow related ICMP responses back from server and undo NAT to restore original IP
 +      # Drop any ICMP related packets where dst ip hasn't been restored back to original IP
 +      ovs_add_flow "test_nat_related_v4" natrelated4 \
 +              "ct_state(-trk),in_port(2),eth(),eth_type(0x0800),ipv4()" \
 +              "ct(commit,nat),recirc(0x2)" || return 1
 +      ovs_add_flow "test_nat_related_v4" natrelated4 \
 +              "recirc_id(0x2),ct_state(+rel+trk),in_port(2),eth(),eth_type(0x0800),ipv4(src=172.31.110.20,dst=172.31.110.10,proto=1),icmp()" \
 +              "1" || return 1
 +      ovs_add_flow "test_nat_related_v4" natrelated4 \
 +              "recirc_id(0x2),ct_state(+rel+trk),in_port(2),eth(),eth_type(0x0800),ipv4(dst=192.168.0.20,proto=1),icmp()" \
 +              "drop" || return 1
 +
 +      # Solicit destination unreachable response from server
 +      ovs_sbx "test_nat_related_v4" ip netns exec client \
 +              bash -c "echo a | nc -u -w 1 172.31.110.20 10000"
 +
 +      # Check to make sure no packets matched the drop rule with incorrect dst ip
 +      python3 "$ovs_base/ovs-dpctl.py" dump-flows natrelated4 \
 +              | grep "drop" | grep "packets:0" >/dev/null || return 1
 +
 +      info "done..."
 +      return 0
 +}
 +
  # netlink_validation
  # - Create a dp
  # - check no warning with "old version" simulation
@@@ -564,7 -502,20 +564,20 @@@ test_netlink_checks () 
            wc -l) == 2 ] || \
              return 1
  
+       info "Checking clone depth"
        ERR_MSG="Flow actions may not be safe on all matching packets"
+       PRE_TEST=$(dmesg | grep -c "${ERR_MSG}")
+       ovs_add_flow "test_netlink_checks" nv0 \
+               'in_port(1),eth(),eth_type(0x800),ipv4()' \
+               'clone(clone(clone(clone(clone(clone(clone(clone(clone(clone(clone(clone(clone(clone(clone(clone(clone(drop)))))))))))))))))' \
+               >/dev/null 2>&1 && return 1
+       POST_TEST=$(dmesg | grep -c "${ERR_MSG}")
+       if [ "$PRE_TEST" == "$POST_TEST" ]; then
+               info "failed - clone depth too large"
+               return 1
+       fi
        PRE_TEST=$(dmesg | grep -c "${ERR_MSG}")
        ovs_add_flow "test_netlink_checks" nv0 \
                'in_port(1),eth(),eth_type(0x0806),arp()' 'drop(0),2' \