Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 16 Jul 2019 03:38:15 +0000 (20:38 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 16 Jul 2019 03:38:15 +0000 (20:38 -0700)
Pull rdma updates from Jason Gunthorpe:
 "A smaller cycle this time. Notably we see another new driver, 'Soft
  iWarp', and the deletion of an ancient unused driver for nes.

   - Revise and simplify the signature offload RDMA MR APIs

   - More progress on hoisting object allocation boilerplate code out
     of the drivers

   - Driver bug fixes and revisions for hns, hfi1, efa, cxgb4, qib,
     i40iw

   - Tree-wide cleanups: struct_size, put_user_page, xarray, rst doc
     conversion

   - Removal of obsolete ib_ucm chardev and nes driver

   - Netlink-based discovery of chardevs and autoloading of the modules
     providing them

   - Move more of the rdmavt/hfi1 uapi to include/uapi/rdma

   - New driver 'siw' for software-based iWarp running on top of netdev,
     much like rxe's software RoCE.

   - mlx5 feature to report events in their raw devx format to userspace

   - Expose per-object counters through the rdma tool

   - Adaptive interrupt moderation for RDMA (DIM), sharing the DIM core
     from netdev"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (194 commits)
  RMDA/siw: Require a 64 bit arch
  RDMA/siw: Mark expected switch fall-throughs
  RDMA/core: Fix -Wunused-const-variable warnings
  rdma/siw: Remove set but not used variable 's'
  rdma/siw: Add missing dependencies on LIBCRC32C and DMA_VIRT_OPS
  RDMA/siw: Add missing rtnl_lock around access to ifa
  rdma/siw: Use proper enumerated type in map_cqe_status
  RDMA/siw: Remove unnecessary kthread create/destroy printouts
  IB/rdmavt: Fix variable shadowing issue in rvt_create_cq
  RDMA/core: Fix race when resolving IP address
  RDMA/core: Make rdma_counter.h compile stand alone
  IB/core: Work on the caller socket net namespace in nldev_newlink()
  RDMA/rxe: Fill in wc byte_len with IB_WC_RECV_RDMA_WITH_IMM
  RDMA/mlx5: Set RDMA DIM to be enabled by default
  RDMA/nldev: Added configuration of RDMA dynamic interrupt moderation to netlink
  RDMA/core: Provide RDMA DIM support for ULPs
  linux/dim: Implement RDMA adaptive moderation (DIM)
  IB/mlx5: Report correctly tag matching rendezvous capability
  docs: infiniband: add it to the driver-api bookset
  IB/mlx5: Implement VHCA tunnel mechanism in DEVX
  ...

221 files changed:
Documentation/ABI/stable/sysfs-class-infiniband
Documentation/index.rst
Documentation/infiniband/core_locking.rst [moved from Documentation/infiniband/core_locking.txt with 78% similarity]
Documentation/infiniband/index.rst [new file with mode: 0644]
Documentation/infiniband/ipoib.rst [moved from Documentation/infiniband/ipoib.txt with 90% similarity]
Documentation/infiniband/opa_vnic.rst [moved from Documentation/infiniband/opa_vnic.txt with 63% similarity]
Documentation/infiniband/sysfs.rst [moved from Documentation/infiniband/sysfs.txt with 69% similarity]
Documentation/infiniband/tag_matching.rst [moved from Documentation/infiniband/tag_matching.txt with 98% similarity]
Documentation/infiniband/user_mad.rst [moved from Documentation/infiniband/user_mad.txt with 90% similarity]
Documentation/infiniband/user_verbs.rst [moved from Documentation/infiniband/user_verbs.txt with 93% similarity]
MAINTAINERS
drivers/infiniband/Kconfig
drivers/infiniband/core/Makefile
drivers/infiniband/core/addr.c
drivers/infiniband/core/core_priv.h
drivers/infiniband/core/counters.c [new file with mode: 0644]
drivers/infiniband/core/cq.c
drivers/infiniband/core/device.c
drivers/infiniband/core/mr_pool.c
drivers/infiniband/core/nldev.c
drivers/infiniband/core/restrack.c
drivers/infiniband/core/restrack.h
drivers/infiniband/core/rw.c
drivers/infiniband/core/sysfs.c
drivers/infiniband/core/ucm.c [deleted file]
drivers/infiniband/core/ucma.c
drivers/infiniband/core/umem.c
drivers/infiniband/core/umem_odp.c
drivers/infiniband/core/user_mad.c
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/core/uverbs_std_types_cq.c
drivers/infiniband/core/uverbs_std_types_mr.c
drivers/infiniband/core/uverbs_uapi.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/Makefile
drivers/infiniband/hw/bnxt_re/ib_verbs.c
drivers/infiniband/hw/bnxt_re/ib_verbs.h
drivers/infiniband/hw/bnxt_re/main.c
drivers/infiniband/hw/cxgb3/cxio_hal.c
drivers/infiniband/hw/cxgb3/cxio_hal.h
drivers/infiniband/hw/cxgb3/iwch_cm.c
drivers/infiniband/hw/cxgb3/iwch_provider.c
drivers/infiniband/hw/cxgb4/cm.c
drivers/infiniband/hw/cxgb4/cq.c
drivers/infiniband/hw/cxgb4/device.c
drivers/infiniband/hw/cxgb4/iw_cxgb4.h
drivers/infiniband/hw/cxgb4/mem.c
drivers/infiniband/hw/cxgb4/provider.c
drivers/infiniband/hw/cxgb4/qp.c
drivers/infiniband/hw/cxgb4/resource.c
drivers/infiniband/hw/efa/efa.h
drivers/infiniband/hw/efa/efa_com.c
drivers/infiniband/hw/efa/efa_com.h
drivers/infiniband/hw/efa/efa_com_cmd.c
drivers/infiniband/hw/efa/efa_main.c
drivers/infiniband/hw/efa/efa_verbs.c
drivers/infiniband/hw/hfi1/Makefile
drivers/infiniband/hw/hfi1/aspm.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/aspm.h
drivers/infiniband/hw/hfi1/debugfs.c
drivers/infiniband/hw/hfi1/mad.c
drivers/infiniband/hw/hfi1/pcie.c
drivers/infiniband/hw/hfi1/pio.c
drivers/infiniband/hw/hfi1/qp.c
drivers/infiniband/hw/hfi1/rc.c
drivers/infiniband/hw/hfi1/tid_rdma.c
drivers/infiniband/hw/hfi1/trace_ibhdrs.h
drivers/infiniband/hw/hfi1/uc.c
drivers/infiniband/hw/hfi1/ud.c
drivers/infiniband/hw/hfi1/user_pages.c
drivers/infiniband/hw/hfi1/verbs.c
drivers/infiniband/hw/hns/Kconfig
drivers/infiniband/hw/hns/Makefile
drivers/infiniband/hw/hns/hns_roce_alloc.c
drivers/infiniband/hw/hns/hns_roce_cmd.c
drivers/infiniband/hw/hns/hns_roce_cq.c
drivers/infiniband/hw/hns/hns_roce_db.c
drivers/infiniband/hw/hns/hns_roce_device.h
drivers/infiniband/hw/hns/hns_roce_hem.c
drivers/infiniband/hw/hns/hns_roce_hem.h
drivers/infiniband/hw/hns/hns_roce_hw_v1.c
drivers/infiniband/hw/hns/hns_roce_hw_v2.c
drivers/infiniband/hw/hns/hns_roce_hw_v2.h
drivers/infiniband/hw/hns/hns_roce_main.c
drivers/infiniband/hw/hns/hns_roce_mr.c
drivers/infiniband/hw/hns/hns_roce_pd.c
drivers/infiniband/hw/hns/hns_roce_qp.c
drivers/infiniband/hw/hns/hns_roce_srq.c
drivers/infiniband/hw/i40iw/i40iw_cm.c
drivers/infiniband/hw/i40iw/i40iw_verbs.c
drivers/infiniband/hw/mlx4/cq.c
drivers/infiniband/hw/mlx4/main.c
drivers/infiniband/hw/mlx4/mlx4_ib.h
drivers/infiniband/hw/mlx4/mr.c
drivers/infiniband/hw/mlx4/qp.c
drivers/infiniband/hw/mlx4/srq.c
drivers/infiniband/hw/mlx5/cq.c
drivers/infiniband/hw/mlx5/devx.c
drivers/infiniband/hw/mlx5/mad.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mem.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/odp.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/mthca/mthca_allocator.c
drivers/infiniband/hw/mthca/mthca_memfree.c
drivers/infiniband/hw/mthca/mthca_provider.c
drivers/infiniband/hw/nes/Kconfig [deleted file]
drivers/infiniband/hw/nes/Makefile [deleted file]
drivers/infiniband/hw/nes/nes.c [deleted file]
drivers/infiniband/hw/nes/nes.h [deleted file]
drivers/infiniband/hw/nes/nes_cm.c [deleted file]
drivers/infiniband/hw/nes/nes_cm.h [deleted file]
drivers/infiniband/hw/nes/nes_context.h [deleted file]
drivers/infiniband/hw/nes/nes_hw.c [deleted file]
drivers/infiniband/hw/nes/nes_hw.h [deleted file]
drivers/infiniband/hw/nes/nes_mgt.c [deleted file]
drivers/infiniband/hw/nes/nes_mgt.h [deleted file]
drivers/infiniband/hw/nes/nes_nic.c [deleted file]
drivers/infiniband/hw/nes/nes_utils.c [deleted file]
drivers/infiniband/hw/nes/nes_verbs.c [deleted file]
drivers/infiniband/hw/nes/nes_verbs.h [deleted file]
drivers/infiniband/hw/ocrdma/ocrdma_hw.c
drivers/infiniband/hw/ocrdma/ocrdma_hw.h
drivers/infiniband/hw/ocrdma/ocrdma_main.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
drivers/infiniband/hw/qedr/main.c
drivers/infiniband/hw/qedr/verbs.c
drivers/infiniband/hw/qedr/verbs.h
drivers/infiniband/hw/qib/qib_qp.c
drivers/infiniband/hw/qib/qib_rc.c
drivers/infiniband/hw/qib/qib_uc.c
drivers/infiniband/hw/qib/qib_ud.c
drivers/infiniband/hw/qib/qib_user_pages.c
drivers/infiniband/hw/qib/qib_user_sdma.c
drivers/infiniband/hw/qib/qib_verbs.c
drivers/infiniband/hw/usnic/usnic_ib.h
drivers/infiniband/hw/usnic/usnic_ib_main.c
drivers/infiniband/hw/usnic/usnic_ib_verbs.c
drivers/infiniband/hw/usnic/usnic_ib_verbs.h
drivers/infiniband/hw/usnic/usnic_uiom.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
drivers/infiniband/sw/Makefile
drivers/infiniband/sw/rdmavt/ah.c
drivers/infiniband/sw/rdmavt/cq.c
drivers/infiniband/sw/rdmavt/cq.h
drivers/infiniband/sw/rdmavt/mr.c
drivers/infiniband/sw/rdmavt/qp.c
drivers/infiniband/sw/rdmavt/qp.h
drivers/infiniband/sw/rdmavt/rc.c
drivers/infiniband/sw/rdmavt/srq.c
drivers/infiniband/sw/rdmavt/trace_mr.h
drivers/infiniband/sw/rdmavt/vt.c
drivers/infiniband/sw/rdmavt/vt.h
drivers/infiniband/sw/rxe/rxe_comp.c
drivers/infiniband/sw/rxe/rxe_mr.c
drivers/infiniband/sw/rxe/rxe_pool.c
drivers/infiniband/sw/rxe/rxe_resp.c
drivers/infiniband/sw/rxe/rxe_verbs.c
drivers/infiniband/sw/rxe/rxe_verbs.h
drivers/infiniband/sw/siw/Kconfig [new file with mode: 0644]
drivers/infiniband/sw/siw/Makefile [new file with mode: 0644]
drivers/infiniband/sw/siw/iwarp.h [new file with mode: 0644]
drivers/infiniband/sw/siw/siw.h [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_cm.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_cm.h [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_cq.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_main.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_mem.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_mem.h [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_qp.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_qp_rx.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_qp_tx.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_verbs.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_verbs.h [new file with mode: 0644]
drivers/infiniband/ulp/ipoib/Kconfig
drivers/infiniband/ulp/ipoib/ipoib_cm.c
drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_verbs.c
drivers/infiniband/ulp/iser/iscsi_iser.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/iser/iser_initiator.c
drivers/infiniband/ulp/iser/iser_memory.c
drivers/infiniband/ulp/iser/iser_verbs.c
drivers/infiniband/ulp/isert/ib_isert.c
drivers/infiniband/ulp/srp/ib_srp.c
drivers/nvme/host/rdma.c
include/linux/dim.h
include/linux/mlx5/mlx5_ifc.h
include/linux/mlx5/qp.h
include/rdma/ib_umem.h
include/rdma/ib_umem_odp.h
include/rdma/ib_verbs.h
include/rdma/mr_pool.h
include/rdma/rdma_counter.h [new file with mode: 0644]
include/rdma/rdma_netlink.h
include/rdma/rdma_vt.h
include/rdma/rdmavt_cq.h
include/rdma/rdmavt_qp.h
include/rdma/restrack.h
include/rdma/rw.h
include/rdma/signature.h [new file with mode: 0644]
include/uapi/rdma/ib_user_cm.h [deleted file]
include/uapi/rdma/mlx5_user_ioctl_cmds.h
include/uapi/rdma/mlx5_user_ioctl_verbs.h
include/uapi/rdma/rdma_netlink.h
include/uapi/rdma/rdma_user_ioctl_cmds.h
include/uapi/rdma/rvt-abi.h [new file with mode: 0644]
include/uapi/rdma/siw-abi.h [new file with mode: 0644]
lib/dim/Makefile
lib/dim/rdma_dim.c [new file with mode: 0644]
net/rds/ib_cm.c

index 17211ceb9bf43876b886fe00ae08d4fcec74fe66..aed21b8916a25ac82767c64fae32601352af239b 100644 (file)
@@ -423,23 +423,6 @@ Description:
                (e.g. driver restart on the VM which owns the VF).
 
 
-sysfs interface for NetEffect RNIC Low-Level iWARP driver (nes)
----------------------------------------------------------------
-
-What:          /sys/class/infiniband/nesX/hw_rev
-What:          /sys/class/infiniband/nesX/hca_type
-What:          /sys/class/infiniband/nesX/board_id
-Date:          Feb, 2008
-KernelVersion: v2.6.25
-Contact:       linux-rdma@vger.kernel.org
-Description:
-               hw_rev:         (RO) Hardware revision number
-
-               hca_type:       (RO) Host Channel Adapter type (NEX020)
-
-               board_id:       (RO) Manufacturing board id
-
-
 sysfs interface for Chelsio T4/T5 RDMA driver (cxgb4)
 -----------------------------------------------------
 
index 216dc0e1e6f2efca39167785e7d32e2078f0b183..71a77feb779b6a7a104b3d59d4163f409b909ebf 100644 (file)
@@ -90,6 +90,7 @@ needed).
 
    driver-api/index
    core-api/index
+   infiniband/index
    media/index
    networking/index
    input/index
similarity index 78%
rename from Documentation/infiniband/core_locking.txt
rename to Documentation/infiniband/core_locking.rst
index 4b1f36b6ada034000d3d8e80831f79b4921f0ab4..f34669beb4fe0d1895377873218d8d82428ac4a2 100644 (file)
@@ -1,4 +1,6 @@
-INFINIBAND MIDLAYER LOCKING
+===========================
+InfiniBand Midlayer Locking
+===========================
 
   This guide is an attempt to make explicit the locking assumptions
   made by the InfiniBand midlayer.  It describes the requirements on
@@ -6,45 +8,47 @@ INFINIBAND MIDLAYER LOCKING
   protocols that use the midlayer.
 
 Sleeping and interrupt context
+==============================
 
   With the following exceptions, a low-level driver implementation of
   all of the methods in struct ib_device may sleep.  The exceptions
   are any methods from the list:
 
-    create_ah
-    modify_ah
-    query_ah
-    destroy_ah
-    post_send
-    post_recv
-    poll_cq
-    req_notify_cq
-    map_phys_fmr
+    create_ah
+    modify_ah
+    query_ah
+    destroy_ah
+    post_send
+    post_recv
+    poll_cq
+    req_notify_cq
+    map_phys_fmr
 
   which may not sleep and must be callable from any context.
 
   The corresponding functions exported to upper level protocol
   consumers:
 
-    ib_create_ah
-    ib_modify_ah
-    ib_query_ah
-    ib_destroy_ah
-    ib_post_send
-    ib_post_recv
-    ib_req_notify_cq
-    ib_map_phys_fmr
+    ib_create_ah
+    ib_modify_ah
+    ib_query_ah
+    ib_destroy_ah
+    ib_post_send
+    ib_post_recv
+    ib_req_notify_cq
+    ib_map_phys_fmr
 
   are therefore safe to call from any context.
 
   In addition, the function
 
-    ib_dispatch_event
+    ib_dispatch_event
 
   used by low-level drivers to dispatch asynchronous events through
   the midlayer is also safe to call from any context.
 
 Reentrancy
+----------
 
   All of the methods in struct ib_device exported by a low-level
   driver must be fully reentrant.  The low-level driver is required to
@@ -62,6 +66,7 @@ Reentrancy
   information between different calls of ib_poll_cq() is not defined.
 
 Callbacks
+---------
 
   A low-level driver must not perform a callback directly from the
   same callchain as an ib_device method call.  For example, it is not
@@ -74,18 +79,18 @@ Callbacks
   completion event handlers for the same CQ are not called
   simultaneously.  The driver must guarantee that only one CQ event
   handler for a given CQ is running at a time.  In other words, the
-  following situation is not allowed:
+  following situation is not allowed::
 
-        CPU1                                    CPU2
+          CPU1                                    CPU2
 
-  low-level driver ->
-    consumer CQ event callback:
-      /* ... */
-      ib_req_notify_cq(cq, ...);
-                                        low-level driver ->
-      /* ... */                           consumer CQ event callback:
-                                            /* ... */
-      return from CQ event handler
+    low-level driver ->
+      consumer CQ event callback:
+        /* ... */
+        ib_req_notify_cq(cq, ...);
+                                          low-level driver ->
+        /* ... */                           consumer CQ event callback:
+                                              /* ... */
+        return from CQ event handler
 
   The context in which completion event and asynchronous event
   callbacks run is not defined.  Depending on the low-level driver, it
@@ -93,6 +98,7 @@ Callbacks
   Upper level protocol consumers may not sleep in a callback.
 
 Hot-plug
+--------
 
   A low-level driver announces that a device is ready for use by
   consumers when it calls ib_register_device(), all initialization
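
A minimal sketch of the driver-side view of the locking rules above (not part of this merge): a low-level driver reporting a port-state change from hard-IRQ context. Only ib_dispatch_event() and struct ib_event are existing API; the helper name is hypothetical.

  /* Hypothetical helper; relies on ib_dispatch_event() being callable
   * from any context, per the midlayer locking rules above. */
  #include <rdma/ib_verbs.h>

  static void example_report_port_active(struct ib_device *ibdev, u8 port_num)
  {
          struct ib_event ev = {
                  .device           = ibdev,
                  .element.port_num = port_num,
                  .event            = IB_EVENT_PORT_ACTIVE,
          };

          /* Safe even from a hard interrupt handler; upper level
           * protocol consumers may not sleep in their event callback. */
          ib_dispatch_event(&ev);
  }
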
diff --git a/Documentation/infiniband/index.rst b/Documentation/infiniband/index.rst
new file mode 100644 (file)
index 0000000..9cd7615
--- /dev/null
@@ -0,0 +1,23 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+InfiniBand
+==========
+
+.. toctree::
+   :maxdepth: 1
+
+   core_locking
+   ipoib
+   opa_vnic
+   sysfs
+   tag_matching
+   user_mad
+   user_verbs
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
similarity index 90%
rename from Documentation/infiniband/ipoib.txt
rename to Documentation/infiniband/ipoib.rst
index 47c1dd9818f2e5e696a34103333867c98bb2c5d8..0dd36154c0c909e151b6590ac420b7509f469cb1 100644 (file)
@@ -1,4 +1,6 @@
-IP OVER INFINIBAND
+==================
+IP over InfiniBand
+==================
 
   The ib_ipoib driver is an implementation of the IP over InfiniBand
   protocol as specified by RFC 4391 and 4392, issued by the IETF ipoib
@@ -8,16 +10,17 @@ IP OVER INFINIBAND
   masqueraded to the kernel as ethernet interfaces).
 
 Partitions and P_Keys
+=====================
 
   When the IPoIB driver is loaded, it creates one interface for each
   port using the P_Key at index 0.  To create an interface with a
   different P_Key, write the desired P_Key into the main interface's
-  /sys/class/net/<intf name>/create_child file.  For example:
+  /sys/class/net/<intf name>/create_child file.  For example::
 
     echo 0x8001 > /sys/class/net/ib0/create_child
 
   This will create an interface named ib0.8001 with P_Key 0x8001.  To
-  remove a subinterface, use the "delete_child" file:
+  remove a subinterface, use the "delete_child" file::
 
     echo 0x8001 > /sys/class/net/ib0/delete_child
 
@@ -28,6 +31,7 @@ Partitions and P_Keys
   rtnl_link_ops, where children created using either way behave the same.
 
 Datagram vs Connected modes
+===========================
 
   The IPoIB driver supports two modes of operation: datagram and
   connected.  The mode is set and read through an interface's
@@ -51,6 +55,7 @@ Datagram vs Connected modes
   networking stack to use the smaller UD MTU for these neighbours.
 
 Stateless offloads
+==================
 
   If the IB HW supports IPoIB stateless offloads, IPoIB advertises
   TCP/IP checksum and/or Large Send (LSO) offloading capability to the
@@ -60,9 +65,10 @@ Stateless offloads
   on/off using ethtool calls.  Currently LRO is supported only for
   checksum offload capable devices.
 
-  Stateless offloads are supported only in datagram mode.  
+  Stateless offloads are supported only in datagram mode.
 
 Interrupt moderation
+====================
 
   If the underlying IB device supports CQ event moderation, one can
   use ethtool to set interrupt mitigation parameters and thus reduce
@@ -71,6 +77,7 @@ Interrupt moderation
   moderation is supported.
 
 Debugging Information
+=====================
 
   By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
   to 'y', tracing messages are compiled into the driver.  They are
@@ -79,7 +86,7 @@ Debugging Information
   runtime through files in /sys/module/ib_ipoib/.
 
   CONFIG_INFINIBAND_IPOIB_DEBUG also enables files in the debugfs
-  virtual filesystem.  By mounting this filesystem, for example with
+  virtual filesystem.  By mounting this filesystem, for example with::
 
     mount -t debugfs none /sys/kernel/debug
 
@@ -96,10 +103,13 @@ Debugging Information
   performance, because it adds tests to the fast path.
 
 References
+==========
 
   Transmission of IP over InfiniBand (IPoIB) (RFC 4391)
-    http://ietf.org/rfc/rfc4391.txt 
+    http://ietf.org/rfc/rfc4391.txt
+
   IP over InfiniBand (IPoIB) Architecture (RFC 4392)
-    http://ietf.org/rfc/rfc4392.txt 
+    http://ietf.org/rfc/rfc4392.txt
+
   IP over InfiniBand: Connected Mode (RFC 4755)
     http://ietf.org/rfc/rfc4755.txt
similarity index 63%
rename from Documentation/infiniband/opa_vnic.txt
rename to Documentation/infiniband/opa_vnic.rst
index 282e17be798a9cb4985b37f01e65252af0dc0362..2f888d9ffec0275e2d4b1f6aa9713b50ddc8d98b 100644 (file)
@@ -1,3 +1,7 @@
+=================================================================
+Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC)
+=================================================================
+
 Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) feature
 supports Ethernet functionality over Omni-Path fabric by encapsulating
 the Ethernet packets between HFI nodes.
@@ -17,70 +21,72 @@ an independent Ethernet network. The configuration is performed by an
 Ethernet Manager (EM) which is part of the trusted Fabric Manager (FM)
 application. HFI nodes can have multiple VNICs each connected to a
 different virtual Ethernet switch. The below diagram presents a case
-of two virtual Ethernet switches with two HFI nodes.
-
-                             +-------------------+
-                             |      Subnet/      |
-                             |     Ethernet      |
-                             |      Manager      |
-                             +-------------------+
-                                /          /
-                              /           /
-                            /            /
-                          /             /
-+-----------------------------+  +------------------------------+
-|  Virtual Ethernet Switch    |  |  Virtual Ethernet Switch     |
-|  +---------+    +---------+ |  | +---------+    +---------+   |
-|  | VPORT   |    |  VPORT  | |  | |  VPORT  |    |  VPORT  |   |
-+--+---------+----+---------+-+  +-+---------+----+---------+---+
-         |                 \        /                 |
-         |                   \    /                   |
-         |                     \/                     |
-         |                    /  \                    |
-         |                  /      \                  |
-     +-----------+------------+  +-----------+------------+
-     |   VNIC    |    VNIC    |  |    VNIC   |    VNIC    |
-     +-----------+------------+  +-----------+------------+
-     |          HFI           |  |          HFI           |
-     +------------------------+  +------------------------+
+of two virtual Ethernet switches with two HFI nodes::
+
+                               +-------------------+
+                               |      Subnet/      |
+                               |     Ethernet      |
+                               |      Manager      |
+                               +-------------------+
+                                  /          /
+                                /           /
+                              /            /
+                            /             /
+  +-----------------------------+  +------------------------------+
+  |  Virtual Ethernet Switch    |  |  Virtual Ethernet Switch     |
+  |  +---------+    +---------+ |  | +---------+    +---------+   |
+  |  | VPORT   |    |  VPORT  | |  | |  VPORT  |    |  VPORT  |   |
+  +--+---------+----+---------+-+  +-+---------+----+---------+---+
+           |                 \        /                 |
+           |                   \    /                   |
+           |                     \/                     |
+           |                    /  \                    |
+           |                  /      \                  |
+       +-----------+------------+  +-----------+------------+
+       |   VNIC    |    VNIC    |  |    VNIC   |    VNIC    |
+       +-----------+------------+  +-----------+------------+
+       |          HFI           |  |          HFI           |
+       +------------------------+  +------------------------+
 
 
 The Omni-Path encapsulated Ethernet packet format is as described below.
 
-Bits          Field
-------------------------------------
+==================== ================================
+Bits                 Field
+==================== ================================
 Quad Word 0:
-0-19      SLID (lower 20 bits)
-20-30     Length (in Quad Words)
-31        BECN bit
-32-51     DLID (lower 20 bits)
-52-56     SC (Service Class)
-57-59     RC (Routing Control)
-60        FECN bit
-61-62     L2 (=10, 16B format)
-63        LT (=1, Link Transfer Head Flit)
+0-19                 SLID (lower 20 bits)
+20-30                Length (in Quad Words)
+31                   BECN bit
+32-51                DLID (lower 20 bits)
+52-56                SC (Service Class)
+57-59                RC (Routing Control)
+60                   FECN bit
+61-62                L2 (=10, 16B format)
+63                   LT (=1, Link Transfer Head Flit)
 
 Quad Word 1:
-0-7       L4 type (=0x78 ETHERNET)
-8-11      SLID[23:20]
-12-15     DLID[23:20]
-16-31     PKEY
-32-47     Entropy
-48-63     Reserved
+0-7                  L4 type (=0x78 ETHERNET)
+8-11                 SLID[23:20]
+12-15                DLID[23:20]
+16-31                PKEY
+32-47                Entropy
+48-63                Reserved
 
 Quad Word 2:
-0-15      Reserved
-16-31     L4 header
-32-63     Ethernet Packet
+0-15                 Reserved
+16-31                L4 header
+32-63                Ethernet Packet
 
 Quad Words 3 to N-1:
-0-63      Ethernet packet (pad extended)
+0-63                 Ethernet packet (pad extended)
 
 Quad Word N (last):
-0-23      Ethernet packet (pad extended)
-24-55     ICRC
-56-61     Tail
-62-63     LT (=01, Link Transfer Tail Flit)
+0-23                 Ethernet packet (pad extended)
+24-55                ICRC
+56-61                Tail
+62-63                LT (=01, Link Transfer Tail Flit)
+==================== ================================
 
 Ethernet packet is padded on the transmit side to ensure that the VNIC OPA
 packet is quad word aligned. The 'Tail' field contains the number of bytes
@@ -123,7 +129,7 @@ operation. It also handles the encapsulation of Ethernet packets with an
 Omni-Path header in the transmit path. For each VNIC interface, the
 information required for encapsulation is configured by the EM via VEMA MAD
 interface. It also passes any control information to the HW dependent driver
-by invoking the RDMA netdev control operations.
+by invoking the RDMA netdev control operations::
 
         +-------------------+ +----------------------+
         |                   | |       Linux          |
similarity index 69%
rename from Documentation/infiniband/sysfs.txt
rename to Documentation/infiniband/sysfs.rst
index 9fab5062f84bad40c6ae140d49ea645b64c0c45c..f0abd6fa48f44c1104ef168f3383264ea6517cbd 100644 (file)
@@ -1,4 +1,6 @@
-SYSFS FILES
+===========
+Sysfs files
+===========
 
 The sysfs interface has moved to
 Documentation/ABI/stable/sysfs-class-infiniband.
similarity index 98%
rename from Documentation/infiniband/tag_matching.txt
rename to Documentation/infiniband/tag_matching.rst
index d2a3bf81922603ac9d210baa7bee37aab3b9c426..ef56ea585f928ce07ce38559b6528217846783a1 100644 (file)
@@ -1,12 +1,16 @@
+==================
 Tag matching logic
+==================
 
 The MPI standard defines a set of rules, known as tag-matching, for matching
 source send operations to destination receives.  The following parameters must
 match the following source and destination parameters:
+
 *      Communicator
 *      User tag - wild card may be specified by the receiver
 *      Source rank – wild car may be specified by the receiver
 *      Destination rank – wild
+
 The ordering rules require that when more than one pair of send and receive
 message envelopes may match, the pair that includes the earliest posted-send
 and the earliest posted-receive is the pair that must be used to satisfy the
@@ -35,6 +39,7 @@ the header to initiate an RDMA READ operation directly to the matching buffer.
 A fin message needs to be received in order for the buffer to be reused.
 
 Tag matching implementation
+===========================
 
 There are two types of matching objects used, the posted receive list and the
 unexpected message list. The application posts receive buffers through calls
similarity index 90%
rename from Documentation/infiniband/user_mad.txt
rename to Documentation/infiniband/user_mad.rst
index 7aca13a54a3a2beadfe53589b74cdf91af2d5cb5..d88abfc0e3700ad734d9ed13befc81beab1495f2 100644 (file)
@@ -1,6 +1,9 @@
-USERSPACE MAD ACCESS
+====================
+Userspace MAD access
+====================
 
 Device files
+============
 
   Each port of each InfiniBand device has a "umad" device and an
   "issm" device attached.  For example, a two-port HCA will have two
@@ -8,12 +11,13 @@ Device files
   device of each type (for switch port 0).
 
 Creating MAD agents
+===================
 
   A MAD agent can be created by filling in a struct ib_user_mad_reg_req
   and then calling the IB_USER_MAD_REGISTER_AGENT ioctl on a file
   descriptor for the appropriate device file.  If the registration
   request succeeds, a 32-bit id will be returned in the structure.
-  For example:
+  For example::
 
        struct ib_user_mad_reg_req req = { /* ... */ };
        ret = ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (char *) &req);
@@ -26,12 +30,14 @@ Creating MAD agents
   ioctl.  Also, all agents registered through a file descriptor will
   be unregistered when the descriptor is closed.
 
-  2014 -- a new registration ioctl is now provided which allows additional
+  2014
+       a new registration ioctl is now provided which allows additional
        fields to be provided during registration.
        Users of this registration call are implicitly setting the use of
        pkey_index (see below).
 
 Receiving MADs
+==============
 
   MADs are received using read().  The receive side now supports
   RMPP. The buffer passed to read() must be at least one
@@ -41,7 +47,8 @@ Receiving MADs
   MAD (RMPP), the errno is set to ENOSPC and the length of the
   buffer needed is set in mad.length.
 
-  Example for normal MAD (non RMPP) reads:
+  Example for normal MAD (non RMPP) reads::
+
        struct ib_user_mad *mad;
        mad = malloc(sizeof *mad + 256);
        ret = read(fd, mad, sizeof *mad + 256);
@@ -50,7 +57,8 @@ Receiving MADs
                free(mad);
        }
 
-  Example for RMPP reads:
+  Example for RMPP reads::
+
        struct ib_user_mad *mad;
        mad = malloc(sizeof *mad + 256);
        ret = read(fd, mad, sizeof *mad + 256);
@@ -76,11 +84,12 @@ Receiving MADs
   poll()/select() may be used to wait until a MAD can be read.
 
 Sending MADs
+============
 
   MADs are sent using write().  The agent ID for sending should be
   filled into the id field of the MAD, the destination LID should be
   filled into the lid field, and so on.  The send side does support
-  RMPP so arbitrary length MAD can be sent. For example:
+  RMPP so arbitrary length MAD can be sent. For example::
 
        struct ib_user_mad *mad;
 
@@ -97,6 +106,7 @@ Sending MADs
                perror("write");
 
 Transaction IDs
+===============
 
   Users of the umad devices can use the lower 32 bits of the
   transaction ID field (that is, the least significant half of the
@@ -105,6 +115,7 @@ Transaction IDs
   the kernel and will be overwritten before a MAD is sent.
 
 P_Key Index Handling
+====================
 
   The old ib_umad interface did not allow setting the P_Key index for
   MADs that are sent and did not provide a way for obtaining the P_Key
@@ -119,6 +130,7 @@ P_Key Index Handling
   default, and the IB_USER_MAD_ENABLE_PKEY ioctl will be removed.
 
 Setting IsSM Capability Bit
+===========================
 
   To set the IsSM capability bit for a port, simply open the
   corresponding issm device file.  If the IsSM bit is already set,
@@ -129,25 +141,26 @@ Setting IsSM Capability Bit
   the issm file.
 
 /dev files
+==========
 
   To create the appropriate character device files automatically with
-  udev, a rule like
+  udev, a rule like::
 
     KERNEL=="umad*", NAME="infiniband/%k"
     KERNEL=="issm*", NAME="infiniband/%k"
 
-  can be used.  This will create device nodes named
+  can be used.  This will create device nodes named::
 
     /dev/infiniband/umad0
     /dev/infiniband/issm0
 
   for the first port, and so on.  The InfiniBand device and port
-  associated with these devices can be determined from the files
+  associated with these devices can be determined from the files::
 
     /sys/class/infiniband_mad/umad0/ibdev
     /sys/class/infiniband_mad/umad0/port
 
-  and
+  and::
 
     /sys/class/infiniband_mad/issm0/ibdev
     /sys/class/infiniband_mad/issm0/port
similarity index 93%
rename from Documentation/infiniband/user_verbs.txt
rename to Documentation/infiniband/user_verbs.rst
index 47ebf2f80b2bd3089c15b30a6983356b939a2c86..8ddc4b1cfef2734224a3eb7e3dadcbbec7c0f45b 100644 (file)
@@ -1,4 +1,6 @@
-USERSPACE VERBS ACCESS
+======================
+Userspace verbs access
+======================
 
   The ib_uverbs module, built by enabling CONFIG_INFINIBAND_USER_VERBS,
   enables direct userspace access to IB hardware via "verbs," as
@@ -13,6 +15,7 @@ USERSPACE VERBS ACCESS
   libmthca userspace driver be installed.
 
 User-kernel communication
+=========================
 
   Userspace communicates with the kernel for slow path, resource
   management operations via the /dev/infiniband/uverbsN character
@@ -28,6 +31,7 @@ User-kernel communication
   system call.
 
 Resource management
+===================
 
   Since creation and destruction of all IB resources is done by
   commands passed through a file descriptor, the kernel can keep track
@@ -41,6 +45,7 @@ Resource management
   prevent one process from touching another process's resources.
 
 Memory pinning
+==============
 
   Direct userspace I/O requires that memory regions that are potential
   I/O targets be kept resident at the same physical address.  The
@@ -54,13 +59,14 @@ Memory pinning
   number of pages pinned by a process.
 
 /dev files
+==========
 
   To create the appropriate character device files automatically with
-  udev, a rule like
+  udev, a rule like::
 
     KERNEL=="uverbs*", NAME="infiniband/%k"
 
-  can be used.  This will create device nodes named
+  can be used.  This will create device nodes named::
 
     /dev/infiniband/uverbs0
 
index a7901e1df0584166f9d777e5e84a418a614a239b..350bb27a1c25f77c713987a3c464757281dfc719 100644 (file)
@@ -11018,14 +11018,6 @@ F:     driver/net/net_failover.c
 F:     include/net/net_failover.h
 F:     Documentation/networking/net_failover.rst
 
-NETEFFECT IWARP RNIC DRIVER (IW_NES)
-M:     Faisal Latif <faisal.latif@intel.com>
-L:     linux-rdma@vger.kernel.org
-W:     http://www.intel.com/Products/Server/Adapters/Server-Cluster/Server-Cluster-overview.htm
-S:     Supported
-F:     drivers/infiniband/hw/nes/
-F:     include/uapi/rdma/nes-abi.h
-
 NETEM NETWORK EMULATOR
 M:     Stephen Hemminger <stephen@networkplumber.org>
 L:     netem@lists.linux-foundation.org (moderated for non-subscribers)
@@ -14755,6 +14747,13 @@ M:     Chris Boot <bootc@bootc.net>
 S:     Maintained
 F:     drivers/leds/leds-net48xx.c
 
+SOFT-IWARP DRIVER (siw)
+M:     Bernard Metzler <bmt@zurich.ibm.com>
+L:     linux-rdma@vger.kernel.org
+S:     Supported
+F:     drivers/infiniband/sw/siw/
+F:     include/uapi/rdma/siw-abi.h
+
 SOFT-ROCE DRIVER (rxe)
 M:     Moni Shoua <monis@mellanox.com>
 L:     linux-rdma@vger.kernel.org
index 8ba41cbf18697d1045bd0b9f04c1f7bccc528497..85e103b147cc387cdf1a9a96fa3bf97341d206a7 100644 (file)
@@ -7,6 +7,7 @@ menuconfig INFINIBAND
        depends on m || IPV6 != m
        depends on !ALPHA
        select IRQ_POLL
+       select DIMLIB
        ---help---
          Core support for InfiniBand (IB).  Make sure to also select
          any protocols you wish to use as well as drivers for your
@@ -36,17 +37,6 @@ config INFINIBAND_USER_ACCESS
          libibverbs, libibcm and a hardware driver library from
          rdma-core <https://github.com/linux-rdma/rdma-core>.
 
-config INFINIBAND_USER_ACCESS_UCM
-       tristate "Userspace CM (UCM, DEPRECATED)"
-       depends on BROKEN || COMPILE_TEST
-       depends on INFINIBAND_USER_ACCESS
-       help
-         The UCM module has known security flaws, which no one is
-         interested to fix. The user-space part of this code was
-         dropped from the upstream a long time ago.
-
-         This option is DEPRECATED and planned to be removed.
-
 config INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI
        bool "Allow experimental legacy verbs in new ioctl uAPI  (EXPERIMENTAL)"
        depends on INFINIBAND_USER_ACCESS
@@ -98,7 +88,6 @@ source "drivers/infiniband/hw/efa/Kconfig"
 source "drivers/infiniband/hw/i40iw/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
 source "drivers/infiniband/hw/mlx5/Kconfig"
-source "drivers/infiniband/hw/nes/Kconfig"
 source "drivers/infiniband/hw/ocrdma/Kconfig"
 source "drivers/infiniband/hw/vmw_pvrdma/Kconfig"
 source "drivers/infiniband/hw/usnic/Kconfig"
@@ -108,6 +97,7 @@ source "drivers/infiniband/hw/hfi1/Kconfig"
 source "drivers/infiniband/hw/qedr/Kconfig"
 source "drivers/infiniband/sw/rdmavt/Kconfig"
 source "drivers/infiniband/sw/rxe/Kconfig"
+source "drivers/infiniband/sw/siw/Kconfig"
 endif
 
 source "drivers/infiniband/ulp/ipoib/Kconfig"
index 313f2349b518430c27cce117a9c36d3a304387e0..09881bd5f12dd030df685002d69c4731bf18abb7 100644 (file)
@@ -6,13 +6,12 @@ obj-$(CONFIG_INFINIBAND) +=           ib_core.o ib_cm.o iw_cm.o \
                                        $(infiniband-y)
 obj-$(CONFIG_INFINIBAND_USER_MAD) +=   ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y)
-obj-$(CONFIG_INFINIBAND_USER_ACCESS_UCM) += ib_ucm.o $(user_access-y)
 
 ib_core-y :=                   packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
                                device.o fmr_pool.o cache.o netlink.o \
                                roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
                                multicast.o mad.o smi.o agent.o mad_rmpp.o \
-                               nldev.o restrack.o
+                               nldev.o restrack.o counters.o
 
 ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
@@ -29,8 +28,6 @@ rdma_ucm-y :=                 ucma.o
 
 ib_umad-y :=                   user_mad.o
 
-ib_ucm-y :=                    ucm.o
-
 ib_uverbs-y :=                 uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
                                rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
                                uverbs_std_types_cq.o \
index 2f7d14159841f833e26a1974c4e14b5be15f7677..9b76a8fcdd2479bc696184eee6848fde6de8d994 100644 (file)
@@ -337,7 +337,7 @@ static int dst_fetch_ha(const struct dst_entry *dst,
                neigh_event_send(n, NULL);
                ret = -ENODATA;
        } else {
-               memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN);
+               neigh_ha_snapshot(dev_addr->dst_dev_addr, n, dst->dev);
        }
 
        neigh_release(n);
index ff40a450b5d28ed56b38eb590e7a08084a70ce49..888d89ce81df07118fd21683241921a1499e6aea 100644 (file)
@@ -60,6 +60,7 @@ extern bool ib_devices_shared_netns;
 int ib_device_register_sysfs(struct ib_device *device);
 void ib_device_unregister_sysfs(struct ib_device *device);
 int ib_device_rename(struct ib_device *ibdev, const char *name);
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim);
 
 typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
              struct net_device *idev, void *cookie);
@@ -88,6 +89,15 @@ typedef int (*nldev_callback)(struct ib_device *device,
 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
                     struct netlink_callback *cb);
 
+struct ib_client_nl_info {
+       struct sk_buff *nl_msg;
+       struct device *cdev;
+       unsigned int port;
+       u64 abi;
+};
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
+                         struct ib_client_nl_info *res);
+
 enum ib_cache_gid_default_mode {
        IB_CACHE_GID_DEFAULT_MODE_SET,
        IB_CACHE_GID_DEFAULT_MODE_DELETE
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
new file mode 100644 (file)
index 0000000..01faef7
--- /dev/null
@@ -0,0 +1,634 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_counter.h>
+
+#include "core_priv.h"
+#include "restrack.h"
+
+#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE)
+
+static int __counter_set_mode(struct rdma_counter_mode *curr,
+                             enum rdma_nl_counter_mode new_mode,
+                             enum rdma_nl_counter_mask new_mask)
+{
+       if ((new_mode == RDMA_COUNTER_MODE_AUTO) &&
+           ((new_mask & (~ALL_AUTO_MODE_MASKS)) ||
+            (curr->mode != RDMA_COUNTER_MODE_NONE)))
+               return -EINVAL;
+
+       curr->mode = new_mode;
+       curr->mask = new_mask;
+       return 0;
+}
+
+/**
+ * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode
+ *
+ * When @on is true, the @mask must be set; When @on is false, it goes
+ * into manual mode if there's any counter, so that the user is able to
+ * manually access them.
+ */
+int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
+                              bool on, enum rdma_nl_counter_mask mask)
+{
+       struct rdma_port_counter *port_counter;
+       int ret;
+
+       port_counter = &dev->port_data[port].port_counter;
+       mutex_lock(&port_counter->lock);
+       if (on) {
+               ret = __counter_set_mode(&port_counter->mode,
+                                        RDMA_COUNTER_MODE_AUTO, mask);
+       } else {
+               if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               if (port_counter->num_counters)
+                       ret = __counter_set_mode(&port_counter->mode,
+                                                RDMA_COUNTER_MODE_MANUAL, 0);
+               else
+                       ret = __counter_set_mode(&port_counter->mode,
+                                                RDMA_COUNTER_MODE_NONE, 0);
+       }
+
+out:
+       mutex_unlock(&port_counter->lock);
+       return ret;
+}
+
+static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
+                                              enum rdma_nl_counter_mode mode)
+{
+       struct rdma_port_counter *port_counter;
+       struct rdma_counter *counter;
+       int ret;
+
+       if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats)
+               return NULL;
+
+       counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+       if (!counter)
+               return NULL;
+
+       counter->device    = dev;
+       counter->port      = port;
+       counter->res.type  = RDMA_RESTRACK_COUNTER;
+       counter->stats     = dev->ops.counter_alloc_stats(counter);
+       if (!counter->stats)
+               goto err_stats;
+
+       port_counter = &dev->port_data[port].port_counter;
+       mutex_lock(&port_counter->lock);
+       if (mode == RDMA_COUNTER_MODE_MANUAL) {
+               ret = __counter_set_mode(&port_counter->mode,
+                                        RDMA_COUNTER_MODE_MANUAL, 0);
+               if (ret)
+                       goto err_mode;
+       }
+
+       port_counter->num_counters++;
+       mutex_unlock(&port_counter->lock);
+
+       counter->mode.mode = mode;
+       kref_init(&counter->kref);
+       mutex_init(&counter->lock);
+
+       return counter;
+
+err_mode:
+       mutex_unlock(&port_counter->lock);
+       kfree(counter->stats);
+err_stats:
+       kfree(counter);
+       return NULL;
+}
+
+static void rdma_counter_free(struct rdma_counter *counter)
+{
+       struct rdma_port_counter *port_counter;
+
+       port_counter = &counter->device->port_data[counter->port].port_counter;
+       mutex_lock(&port_counter->lock);
+       port_counter->num_counters--;
+       if (!port_counter->num_counters &&
+           (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL))
+               __counter_set_mode(&port_counter->mode, RDMA_COUNTER_MODE_NONE,
+                                  0);
+
+       mutex_unlock(&port_counter->lock);
+
+       rdma_restrack_del(&counter->res);
+       kfree(counter->stats);
+       kfree(counter);
+}
+
+static void auto_mode_init_counter(struct rdma_counter *counter,
+                                  const struct ib_qp *qp,
+                                  enum rdma_nl_counter_mask new_mask)
+{
+       struct auto_mode_param *param = &counter->mode.param;
+
+       counter->mode.mode = RDMA_COUNTER_MODE_AUTO;
+       counter->mode.mask = new_mask;
+
+       if (new_mask & RDMA_COUNTER_MASK_QP_TYPE)
+               param->qp_type = qp->qp_type;
+}
+
+static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter,
+                           enum rdma_nl_counter_mask auto_mask)
+{
+       struct auto_mode_param *param = &counter->mode.param;
+       bool match = true;
+
+       if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res))
+               return false;
+
+       /* Ensure that counter belong to right PID */
+       if (!rdma_is_kernel_res(&counter->res) &&
+           !rdma_is_kernel_res(&qp->res) &&
+           (task_pid_vnr(counter->res.task) != current->pid))
+               return false;
+
+       if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE)
+               match &= (param->qp_type == qp->qp_type);
+
+       return match;
+}
+
+static int __rdma_counter_bind_qp(struct rdma_counter *counter,
+                                 struct ib_qp *qp)
+{
+       int ret;
+
+       if (qp->counter)
+               return -EINVAL;
+
+       if (!qp->device->ops.counter_bind_qp)
+               return -EOPNOTSUPP;
+
+       mutex_lock(&counter->lock);
+       ret = qp->device->ops.counter_bind_qp(counter, qp);
+       mutex_unlock(&counter->lock);
+
+       return ret;
+}
+
+static int __rdma_counter_unbind_qp(struct ib_qp *qp)
+{
+       struct rdma_counter *counter = qp->counter;
+       int ret;
+
+       if (!qp->device->ops.counter_unbind_qp)
+               return -EOPNOTSUPP;
+
+       mutex_lock(&counter->lock);
+       ret = qp->device->ops.counter_unbind_qp(qp);
+       mutex_unlock(&counter->lock);
+
+       return ret;
+}
+
+static void counter_history_stat_update(const struct rdma_counter *counter)
+{
+       struct ib_device *dev = counter->device;
+       struct rdma_port_counter *port_counter;
+       int i;
+
+       port_counter = &dev->port_data[counter->port].port_counter;
+       if (!port_counter->hstats)
+               return;
+
+       for (i = 0; i < counter->stats->num_counters; i++)
+               port_counter->hstats->value[i] += counter->stats->value[i];
+}
+
+/**
+ * rdma_get_counter_auto_mode - Find the counter that @qp should be bound
+ *     with in auto mode
+ *
+ * Return: The counter (with ref-count increased) if found
+ */
+static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp,
+                                                      u8 port)
+{
+       struct rdma_port_counter *port_counter;
+       struct rdma_counter *counter = NULL;
+       struct ib_device *dev = qp->device;
+       struct rdma_restrack_entry *res;
+       struct rdma_restrack_root *rt;
+       unsigned long id = 0;
+
+       port_counter = &dev->port_data[port].port_counter;
+       rt = &dev->res[RDMA_RESTRACK_COUNTER];
+       xa_lock(&rt->xa);
+       xa_for_each(&rt->xa, id, res) {
+               if (!rdma_is_visible_in_pid_ns(res))
+                       continue;
+
+               counter = container_of(res, struct rdma_counter, res);
+               if ((counter->device != qp->device) || (counter->port != port))
+                       goto next;
+
+               if (auto_mode_match(qp, counter, port_counter->mode.mask))
+                       break;
+next:
+               counter = NULL;
+       }
+
+       if (counter && !kref_get_unless_zero(&counter->kref))
+               counter = NULL;
+
+       xa_unlock(&rt->xa);
+       return counter;
+}
+
+static void rdma_counter_res_add(struct rdma_counter *counter,
+                                struct ib_qp *qp)
+{
+       if (rdma_is_kernel_res(&qp->res)) {
+               rdma_restrack_set_task(&counter->res, qp->res.kern_name);
+               rdma_restrack_kadd(&counter->res);
+       } else {
+               rdma_restrack_attach_task(&counter->res, qp->res.task);
+               rdma_restrack_uadd(&counter->res);
+       }
+}
+
+static void counter_release(struct kref *kref)
+{
+       struct rdma_counter *counter;
+
+       counter = container_of(kref, struct rdma_counter, kref);
+       counter_history_stat_update(counter);
+       counter->device->ops.counter_dealloc(counter);
+       rdma_counter_free(counter);
+}
+
+/**
+ * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on
+ *   the auto-mode rule
+ */
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port)
+{
+       struct rdma_port_counter *port_counter;
+       struct ib_device *dev = qp->device;
+       struct rdma_counter *counter;
+       int ret;
+
+       if (!rdma_is_port_valid(dev, port))
+               return -EINVAL;
+
+       port_counter = &dev->port_data[port].port_counter;
+       if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO)
+               return 0;
+
+       counter = rdma_get_counter_auto_mode(qp, port);
+       if (counter) {
+               ret = __rdma_counter_bind_qp(counter, qp);
+               if (ret) {
+                       kref_put(&counter->kref, counter_release);
+                       return ret;
+               }
+       } else {
+               counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_AUTO);
+               if (!counter)
+                       return -ENOMEM;
+
+               auto_mode_init_counter(counter, qp, port_counter->mode.mask);
+
+               ret = __rdma_counter_bind_qp(counter, qp);
+               if (ret) {
+                       rdma_counter_free(counter);
+                       return ret;
+               }
+
+               rdma_counter_res_add(counter, qp);
+       }
+
+       return 0;
+}
+
+/**
+ * rdma_counter_unbind_qp - Unbind a qp from a counter
+ * @force:
+ *   true - Decrease the counter ref-count anyway (e.g., qp destroy)
+ */
+int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
+{
+       struct rdma_counter *counter = qp->counter;
+       int ret;
+
+       if (!counter)
+               return -EINVAL;
+
+       ret = __rdma_counter_unbind_qp(qp);
+       if (ret && !force)
+               return ret;
+
+       kref_put(&counter->kref, counter_release);
+       return 0;
+}
+
+int rdma_counter_query_stats(struct rdma_counter *counter)
+{
+       struct ib_device *dev = counter->device;
+       int ret;
+
+       if (!dev->ops.counter_update_stats)
+               return -EINVAL;
+
+       mutex_lock(&counter->lock);
+       ret = dev->ops.counter_update_stats(counter);
+       mutex_unlock(&counter->lock);
+
+       return ret;
+}
+
+static u64 get_running_counters_hwstat_sum(struct ib_device *dev,
+                                          u8 port, u32 index)
+{
+       struct rdma_restrack_entry *res;
+       struct rdma_restrack_root *rt;
+       struct rdma_counter *counter;
+       unsigned long id = 0;
+       u64 sum = 0;
+
+       rt = &dev->res[RDMA_RESTRACK_COUNTER];
+       xa_lock(&rt->xa);
+       xa_for_each(&rt->xa, id, res) {
+               if (!rdma_restrack_get(res))
+                       continue;
+
+               xa_unlock(&rt->xa);
+
+               counter = container_of(res, struct rdma_counter, res);
+               if ((counter->device != dev) || (counter->port != port) ||
+                   rdma_counter_query_stats(counter))
+                       goto next;
+
+               sum += counter->stats->value[index];
+
+next:
+               xa_lock(&rt->xa);
+               rdma_restrack_put(res);
+       }
+
+       xa_unlock(&rt->xa);
+       return sum;
+}
+
+/**
+ * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a
+ *   specific port, including the running ones and history data
+ */
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index)
+{
+       struct rdma_port_counter *port_counter;
+       u64 sum;
+
+       port_counter = &dev->port_data[port].port_counter;
+       sum = get_running_counters_hwstat_sum(dev, port, index);
+       sum += port_counter->hstats->value[index];
+
+       return sum;
+}
+
+static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num)
+{
+       struct rdma_restrack_entry *res = NULL;
+       struct ib_qp *qp = NULL;
+
+       res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num);
+       if (IS_ERR(res))
+               return NULL;
+
+       if (!rdma_is_visible_in_pid_ns(res))
+               goto err;
+
+       qp = container_of(res, struct ib_qp, res);
+       if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+               goto err;
+
+       return qp;
+
+err:
+       rdma_restrack_put(&qp->res);
+       return NULL;
+}
+
+static int rdma_counter_bind_qp_manual(struct rdma_counter *counter,
+                                      struct ib_qp *qp)
+{
+       if ((counter->device != qp->device) || (counter->port != qp->port))
+               return -EINVAL;
+
+       return __rdma_counter_bind_qp(counter, qp);
+}
+
+static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev,
+                                                  u32 counter_id)
+{
+       struct rdma_restrack_entry *res;
+       struct rdma_counter *counter;
+
+       res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id);
+       if (IS_ERR(res))
+               return NULL;
+
+       if (!rdma_is_visible_in_pid_ns(res)) {
+               rdma_restrack_put(res);
+               return NULL;
+       }
+
+       counter = container_of(res, struct rdma_counter, res);
+       kref_get(&counter->kref);
+       rdma_restrack_put(res);
+
+       return counter;
+}
+
+/**
+ * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id
+ */
+int rdma_counter_bind_qpn(struct ib_device *dev, u8 port,
+                         u32 qp_num, u32 counter_id)
+{
+       struct rdma_counter *counter;
+       struct ib_qp *qp;
+       int ret;
+
+       qp = rdma_counter_get_qp(dev, qp_num);
+       if (!qp)
+               return -ENOENT;
+
+       counter = rdma_get_counter_by_id(dev, counter_id);
+       if (!counter) {
+               ret = -ENOENT;
+               goto err;
+       }
+
+       if (counter->res.task != qp->res.task) {
+               ret = -EINVAL;
+               goto err_task;
+       }
+
+       ret = rdma_counter_bind_qp_manual(counter, qp);
+       if (ret)
+               goto err_task;
+
+       rdma_restrack_put(&qp->res);
+       return 0;
+
+err_task:
+       kref_put(&counter->kref, counter_release);
+err:
+       rdma_restrack_put(&qp->res);
+       return ret;
+}
+
+/**
+ * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it
+ *   The id of new counter is returned in @counter_id
+ */
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
+                               u32 qp_num, u32 *counter_id)
+{
+       struct rdma_counter *counter;
+       struct ib_qp *qp;
+       int ret;
+
+       if (!rdma_is_port_valid(dev, port))
+               return -EINVAL;
+
+       qp = rdma_counter_get_qp(dev, qp_num);
+       if (!qp)
+               return -ENOENT;
+
+       if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_MANUAL);
+       if (!counter) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       ret = rdma_counter_bind_qp_manual(counter, qp);
+       if (ret)
+               goto err_bind;
+
+       if (counter_id)
+               *counter_id = counter->id;
+
+       rdma_counter_res_add(counter, qp);
+
+       rdma_restrack_put(&qp->res);
+       return ret;
+
+err_bind:
+       rdma_counter_free(counter);
+err:
+       rdma_restrack_put(&qp->res);
+       return ret;
+}
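
A minimal sketch, not part of this patch, of how the two manual-mode entry
points above are driven from netlink (it mirrors nldev_stat_set_doit() later
in this diff); the helper name and the "have_counter_id" flag are placeholders
and error handling is trimmed:

static int bind_qp_to_counter(struct ib_device *dev, u8 port, u32 qpn,
                              bool have_counter_id, u32 *cntn)
{
        /* bind to an existing manual-mode counter chosen by user space ... */
        if (have_counter_id)
                return rdma_counter_bind_qpn(dev, port, qpn, *cntn);

        /* ... or allocate a fresh counter and report its id back */
        return rdma_counter_bind_qpn_alloc(dev, port, qpn, cntn);
}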
+
+/**
+ * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter
+ */
+int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
+                           u32 qp_num, u32 counter_id)
+{
+       struct rdma_port_counter *port_counter;
+       struct ib_qp *qp;
+       int ret;
+
+       if (!rdma_is_port_valid(dev, port))
+               return -EINVAL;
+
+       qp = rdma_counter_get_qp(dev, qp_num);
+       if (!qp)
+               return -ENOENT;
+
+       if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       port_counter = &dev->port_data[port].port_counter;
+       if (!qp->counter || qp->counter->id != counter_id ||
+           port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = rdma_counter_unbind_qp(qp, false);
+
+out:
+       rdma_restrack_put(&qp->res);
+       return ret;
+}
+
+int rdma_counter_get_mode(struct ib_device *dev, u8 port,
+                         enum rdma_nl_counter_mode *mode,
+                         enum rdma_nl_counter_mask *mask)
+{
+       struct rdma_port_counter *port_counter;
+
+       port_counter = &dev->port_data[port].port_counter;
+       *mode = port_counter->mode.mode;
+       *mask = port_counter->mode.mask;
+
+       return 0;
+}
+
+void rdma_counter_init(struct ib_device *dev)
+{
+       struct rdma_port_counter *port_counter;
+       u32 port;
+
+       if (!dev->ops.alloc_hw_stats || !dev->port_data)
+               return;
+
+       rdma_for_each_port(dev, port) {
+               port_counter = &dev->port_data[port].port_counter;
+               port_counter->mode.mode = RDMA_COUNTER_MODE_NONE;
+               mutex_init(&port_counter->lock);
+
+               port_counter->hstats = dev->ops.alloc_hw_stats(dev, port);
+               if (!port_counter->hstats)
+                       goto fail;
+       }
+
+       return;
+
+fail:
+       rdma_for_each_port(dev, port) {
+               port_counter = &dev->port_data[port].port_counter;
+               kfree(port_counter->hstats);
+               port_counter->hstats = NULL;
+       }
+
+       return;
+}
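
rdma_counter_init() only arms the per-port counter state when the driver
provides the long-standing alloc_hw_stats() hook. A hedged sketch of that
driver side, where the "drv_*" names and counter strings are illustrative:

static const char * const drv_stat_names[] = {
        "rx_write_requests",            /* placeholder counter names */
        "rx_read_requests",
};

static struct rdma_hw_stats *drv_alloc_hw_stats(struct ib_device *ibdev,
                                                u8 port_num)
{
        return rdma_alloc_hw_stats_struct(drv_stat_names,
                                          ARRAY_SIZE(drv_stat_names),
                                          RDMA_HW_STATS_DEFAULT_LIFESPAN);
}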
+
+void rdma_counter_release(struct ib_device *dev)
+{
+       struct rdma_port_counter *port_counter;
+       u32 port;
+
+       if (!dev->ops.alloc_hw_stats)
+               return;
+
+       rdma_for_each_port(dev, port) {
+               port_counter = &dev->port_data[port].port_counter;
+               kfree(port_counter->hstats);
+       }
+}
index a24c900fbdf606ad0b4504b56fdcdfd9577b8a5f..7c599878ccf711e22d70771c67a1b093f101670f 100644 (file)
 #define IB_POLL_FLAGS \
        (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
+static const struct dim_cq_moder
+rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
+       {1,   0, 1,  0},
+       {1,   0, 4,  0},
+       {2,   0, 4,  0},
+       {2,   0, 8,  0},
+       {4,   0, 8,  0},
+       {16,  0, 8,  0},
+       {16,  0, 16, 0},
+       {32,  0, 16, 0},
+       {32,  0, 32, 0},
+};
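
The initializers above follow the dim_cq_moder field order assumed from
<linux/dim.h> ({usec, pkts, comps, cq_period_mode}); only the usec and comps
columns are consumed by the RDMA code. As a rough illustration, settling on
profile index 5 would end up asking the driver for:

        /* profile {16, 0, 8, 0}: moderate to 8 completions or 16 usec */
        cq->device->ops.modify_cq(cq, 8 /* cq_count */, 16 /* cq_period */);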
+
+static void ib_cq_rdma_dim_work(struct work_struct *w)
+{
+       struct dim *dim = container_of(w, struct dim, work);
+       struct ib_cq *cq = dim->priv;
+
+       u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+       u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+
+       dim->state = DIM_START_MEASURE;
+
+       cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static void rdma_dim_init(struct ib_cq *cq)
+{
+       struct dim *dim;
+
+       if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
+           cq->poll_ctx == IB_POLL_DIRECT)
+               return;
+
+       dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
+       if (!dim)
+               return;
+
+       dim->state = DIM_START_MEASURE;
+       dim->tune_state = DIM_GOING_RIGHT;
+       dim->profile_ix = RDMA_DIM_START_PROFILE;
+       dim->priv = cq;
+       cq->dim = dim;
+
+       INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
+}
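
rdma_dim_init() is strictly opt-in: nothing happens unless the provider both
implements modify_cq() and sets use_cq_dim on the device (mlx5 does so
elsewhere in this series, and the flag can be toggled at runtime through the
new RDMA_NLDEV_ATTR_DEV_DIM netlink attribute). A hedged sketch of the
provider side, with all "drv_*" names as placeholders:

static int drv_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);

struct drv_dev {
        struct ib_device ib_dev;
        /* ... */
};

static const struct ib_device_ops drv_dev_ops = {
        .modify_cq = drv_modify_cq,     /* required for DIM to take effect */
        /* ... */
};

static int drv_register_device(struct drv_dev *ddev)
{
        struct ib_device *ibdev = &ddev->ib_dev;

        ib_set_device_ops(ibdev, &drv_dev_ops);
        ibdev->use_cq_dim = true;       /* opt in to adaptive CQ moderation */
        return ib_register_device(ibdev, "drv%d");
}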
+
 static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
                           int batch)
 {
@@ -78,6 +125,7 @@ static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
 static int ib_poll_handler(struct irq_poll *iop, int budget)
 {
        struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+       struct dim *dim = cq->dim;
        int completed;
 
        completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
@@ -87,6 +135,9 @@ static int ib_poll_handler(struct irq_poll *iop, int budget)
                        irq_poll_sched(&cq->iop);
        }
 
+       if (dim)
+               rdma_dim(dim, completed);
+
        return completed;
 }
 
@@ -105,6 +156,8 @@ static void ib_cq_poll_work(struct work_struct *work)
        if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
            ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
                queue_work(cq->comp_wq, &cq->work);
+       else if (cq->dim)
+               rdma_dim(cq->dim, completed);
 }
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
@@ -113,7 +166,7 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
 }
 
 /**
- * __ib_alloc_cq - allocate a completion queue
+ * __ib_alloc_cq_user - allocate a completion queue
  * @dev:               device to allocate the CQ for
  * @private:           driver private data, accessible from cq->cq_context
  * @nr_cqe:            number of CQEs to allocate
@@ -139,25 +192,30 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
        struct ib_cq *cq;
        int ret = -ENOMEM;
 
-       cq = dev->ops.create_cq(dev, &cq_attr, NULL);
-       if (IS_ERR(cq))
-               return cq;
+       cq = rdma_zalloc_drv_obj(dev, ib_cq);
+       if (!cq)
+               return ERR_PTR(ret);
 
        cq->device = dev;
-       cq->uobject = NULL;
-       cq->event_handler = NULL;
        cq->cq_context = private;
        cq->poll_ctx = poll_ctx;
        atomic_set(&cq->usecnt, 0);
 
        cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
        if (!cq->wc)
-               goto out_destroy_cq;
+               goto out_free_cq;
 
        cq->res.type = RDMA_RESTRACK_CQ;
        rdma_restrack_set_task(&cq->res, caller);
+
+       ret = dev->ops.create_cq(cq, &cq_attr, NULL);
+       if (ret)
+               goto out_free_wc;
+
        rdma_restrack_kadd(&cq->res);
 
+       rdma_dim_init(cq);
+
        switch (cq->poll_ctx) {
        case IB_POLL_DIRECT:
                cq->comp_handler = ib_cq_completion_direct;
@@ -178,29 +236,29 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
                break;
        default:
                ret = -EINVAL;
-               goto out_free_wc;
+               goto out_destroy_cq;
        }
 
        return cq;
 
-out_free_wc:
-       kfree(cq->wc);
-       rdma_restrack_del(&cq->res);
 out_destroy_cq:
+       rdma_restrack_del(&cq->res);
        cq->device->ops.destroy_cq(cq, udata);
+out_free_wc:
+       kfree(cq->wc);
+out_free_cq:
+       kfree(cq);
        return ERR_PTR(ret);
 }
 EXPORT_SYMBOL(__ib_alloc_cq_user);
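
With ops.create_cq() now receiving a core-allocated ib_cq and returning an
errno, a driver conforming to the new contract looks roughly like the sketch
below; the drv_cq type and cqe_count field are placeholders, and the
INIT_RDMA_OBJ_SIZE() entry is what lets the core size the allocation:

struct drv_cq {
        struct ib_cq    ibcq;           /* embedded core CQ, must come first */
        u32             cqe_count;
};

static int drv_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
                         struct ib_udata *udata)
{
        struct drv_cq *cq = container_of(ibcq, struct drv_cq, ibcq);

        cq->cqe_count = attr->cqe;      /* hardware specific setup goes here */
        return 0;                       /* -errno on failure, no more ERR_PTR */
}

/* in the device's ib_device_ops: */
        .create_cq = drv_create_cq,
        INIT_RDMA_OBJ_SIZE(ib_cq, drv_cq, ibcq),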
 
 /**
- * ib_free_cq - free a completion queue
+ * ib_free_cq_user - free a completion queue
  * @cq:                completion queue to free.
  * @udata:     User data or NULL for kernel object
  */
 void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 {
-       int ret;
-
        if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
                return;
 
@@ -218,9 +276,12 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
                WARN_ON_ONCE(1);
        }
 
-       kfree(cq->wc);
        rdma_restrack_del(&cq->res);
-       ret = cq->device->ops.destroy_cq(cq, udata);
-       WARN_ON_ONCE(ret);
+       cq->device->ops.destroy_cq(cq, udata);
+       if (cq->dim)
+               cancel_work_sync(&cq->dim->work);
+       kfree(cq->dim);
+       kfree(cq->wc);
+       kfree(cq);
 }
 EXPORT_SYMBOL(ib_free_cq_user);
index 3352a107b4a36756518087eef46615b55372e2a0..9773145dee0996d0d058230bc6ce18f9c138d34f 100644 (file)
@@ -46,6 +46,7 @@
 #include <rdma/rdma_netlink.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
 
 #include "core_priv.h"
 #include "restrack.h"
@@ -270,7 +271,7 @@ struct ib_port_data_rcu {
        struct ib_port_data pdata[];
 };
 
-static int ib_device_check_mandatory(struct ib_device *device)
+static void ib_device_check_mandatory(struct ib_device *device)
 {
 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
        static const struct {
@@ -305,8 +306,6 @@ static int ib_device_check_mandatory(struct ib_device *device)
                        break;
                }
        }
-
-       return 0;
 }
 
 /*
@@ -375,7 +374,7 @@ struct ib_device *ib_device_get_by_name(const char *name,
        down_read(&devices_rwsem);
        device = __ib_device_get_by_name(name);
        if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
-           device->driver_id != driver_id)
+           device->ops.driver_id != driver_id)
                device = NULL;
 
        if (device) {
@@ -449,6 +448,15 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
        return 0;
 }
 
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
+{
+       if (use_dim > 1)
+               return -EINVAL;
+       ibdev->use_cq_dim = use_dim;
+
+       return 0;
+}
+
 static int alloc_name(struct ib_device *ibdev, const char *name)
 {
        struct ib_device *device;
@@ -494,10 +502,12 @@ static void ib_device_release(struct device *device)
        if (dev->port_data) {
                ib_cache_release_one(dev);
                ib_security_release_port_pkey_list(dev);
+               rdma_counter_release(dev);
                kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
                                       pdata[0]),
                          rcu_head);
        }
+
        xa_destroy(&dev->compat_devs);
        xa_destroy(&dev->client_data);
        kfree_rcu(dev, rcu_head);
@@ -1193,10 +1203,7 @@ static int setup_device(struct ib_device *device)
        int ret;
 
        setup_dma_device(device);
-
-       ret = ib_device_check_mandatory(device);
-       if (ret)
-               return ret;
+       ib_device_check_mandatory(device);
 
        ret = setup_port_data(device);
        if (ret) {
@@ -1321,6 +1328,8 @@ int ib_register_device(struct ib_device *device, const char *name)
 
        ib_device_register_rdmacg(device);
 
+       rdma_counter_init(device);
+
        /*
         * Ensure that ADD uevent is not fired because it
         * is too early and device is not initialized yet.
@@ -1479,7 +1488,7 @@ void ib_unregister_driver(enum rdma_driver_id driver_id)
 
        down_read(&devices_rwsem);
        xa_for_each (&devices, index, ib_dev) {
-               if (ib_dev->driver_id != driver_id)
+               if (ib_dev->ops.driver_id != driver_id)
                        continue;
 
                get_device(&ib_dev->dev);
@@ -1749,6 +1758,104 @@ void ib_unregister_client(struct ib_client *client)
 }
 EXPORT_SYMBOL(ib_unregister_client);
 
+static int __ib_get_global_client_nl_info(const char *client_name,
+                                         struct ib_client_nl_info *res)
+{
+       struct ib_client *client;
+       unsigned long index;
+       int ret = -ENOENT;
+
+       down_read(&clients_rwsem);
+       xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
+               if (strcmp(client->name, client_name) != 0)
+                       continue;
+               if (!client->get_global_nl_info) {
+                       ret = -EOPNOTSUPP;
+                       break;
+               }
+               ret = client->get_global_nl_info(res);
+               if (WARN_ON(ret == -ENOENT))
+                       ret = -EINVAL;
+               if (!ret && res->cdev)
+                       get_device(res->cdev);
+               break;
+       }
+       up_read(&clients_rwsem);
+       return ret;
+}
+
+static int __ib_get_client_nl_info(struct ib_device *ibdev,
+                                  const char *client_name,
+                                  struct ib_client_nl_info *res)
+{
+       unsigned long index;
+       void *client_data;
+       int ret = -ENOENT;
+
+       down_read(&ibdev->client_data_rwsem);
+       xan_for_each_marked (&ibdev->client_data, index, client_data,
+                            CLIENT_DATA_REGISTERED) {
+               struct ib_client *client = xa_load(&clients, index);
+
+               if (!client || strcmp(client->name, client_name) != 0)
+                       continue;
+               if (!client->get_nl_info) {
+                       ret = -EOPNOTSUPP;
+                       break;
+               }
+               ret = client->get_nl_info(ibdev, client_data, res);
+               if (WARN_ON(ret == -ENOENT))
+                       ret = -EINVAL;
+
+               /*
+                * The cdev is guaranteed valid as long as we are inside the
+                * client_data_rwsem as remove_one can't be called. Keep it
+                * valid for the caller.
+                */
+               if (!ret && res->cdev)
+                       get_device(res->cdev);
+               break;
+       }
+       up_read(&ibdev->client_data_rwsem);
+
+       return ret;
+}
+
+/**
+ * ib_get_client_nl_info - Fetch the nl_info from a client
+ * @ibdev: IB device to query, or NULL to query a device-less (global) client
+ * @client_name: Name of the client
+ * @res: Result of the query
+ */
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
+                         struct ib_client_nl_info *res)
+{
+       int ret;
+
+       if (ibdev)
+               ret = __ib_get_client_nl_info(ibdev, client_name, res);
+       else
+               ret = __ib_get_global_client_nl_info(client_name, res);
+#ifdef CONFIG_MODULES
+       if (ret == -ENOENT) {
+               request_module("rdma-client-%s", client_name);
+               if (ibdev)
+                       ret = __ib_get_client_nl_info(ibdev, client_name, res);
+               else
+                       ret = __ib_get_global_client_nl_info(client_name, res);
+       }
+#endif
+       if (ret) {
+               if (ret == -ENOENT)
+                       return -EOPNOTSUPP;
+               return ret;
+       }
+
+       if (WARN_ON(!res->cdev))
+               return -EINVAL;
+       return 0;
+}
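
The chardev discovery above relies on each client filling in struct
ib_client_nl_info. A hedged sketch of the client side ("foo" and its types
are placeholders; the module alias matches the request_module() pattern used
above):

static int foo_get_nl_info(struct ib_device *ibdev, void *client_data,
                           struct ib_client_nl_info *res)
{
        struct foo_dev *fdev = client_data;

        res->abi = FOO_UVERBS_ABI_VERSION;      /* placeholder ABI constant */
        res->cdev = &fdev->dev;                 /* device owning the char dev */
        return 0;
}

static struct ib_client foo_client = {
        .name        = "foo",
        .get_nl_info = foo_get_nl_info,
};

MODULE_ALIAS("rdma-client-foo");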
+
 /**
  * ib_set_client_data - Set IB client context
  * @device:Device to set context for
@@ -2039,7 +2146,7 @@ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
                                    (uintptr_t)ndev) {
                if (rcu_access_pointer(cur->netdev) == ndev &&
                    (driver_id == RDMA_DRIVER_UNKNOWN ||
-                    cur->ib_dev->driver_id == driver_id) &&
+                    cur->ib_dev->ops.driver_id == driver_id) &&
                    ib_device_try_get(cur->ib_dev)) {
                        res = cur->ib_dev;
                        break;
@@ -2344,12 +2451,28 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 
 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
 
+       if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
+               WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
+                       dev_ops->driver_id != ops->driver_id);
+               dev_ops->driver_id = ops->driver_id;
+       }
+       if (ops->owner) {
+               WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
+               dev_ops->owner = ops->owner;
+       }
+       if (ops->uverbs_abi_ver)
+               dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
+
+       dev_ops->uverbs_no_driver_id_binding |=
+               ops->uverbs_no_driver_id_binding;
+
        SET_DEVICE_OP(dev_ops, add_gid);
        SET_DEVICE_OP(dev_ops, advise_mr);
        SET_DEVICE_OP(dev_ops, alloc_dm);
        SET_DEVICE_OP(dev_ops, alloc_fmr);
        SET_DEVICE_OP(dev_ops, alloc_hw_stats);
        SET_DEVICE_OP(dev_ops, alloc_mr);
+       SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
        SET_DEVICE_OP(dev_ops, alloc_mw);
        SET_DEVICE_OP(dev_ops, alloc_pd);
        SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
@@ -2357,6 +2480,11 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
        SET_DEVICE_OP(dev_ops, alloc_xrcd);
        SET_DEVICE_OP(dev_ops, attach_mcast);
        SET_DEVICE_OP(dev_ops, check_mr_status);
+       SET_DEVICE_OP(dev_ops, counter_alloc_stats);
+       SET_DEVICE_OP(dev_ops, counter_bind_qp);
+       SET_DEVICE_OP(dev_ops, counter_dealloc);
+       SET_DEVICE_OP(dev_ops, counter_unbind_qp);
+       SET_DEVICE_OP(dev_ops, counter_update_stats);
        SET_DEVICE_OP(dev_ops, create_ah);
        SET_DEVICE_OP(dev_ops, create_counters);
        SET_DEVICE_OP(dev_ops, create_cq);
@@ -2409,6 +2537,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
        SET_DEVICE_OP(dev_ops, iw_reject);
        SET_DEVICE_OP(dev_ops, iw_rem_ref);
        SET_DEVICE_OP(dev_ops, map_mr_sg);
+       SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
        SET_DEVICE_OP(dev_ops, map_phys_fmr);
        SET_DEVICE_OP(dev_ops, mmap);
        SET_DEVICE_OP(dev_ops, modify_ah);
@@ -2445,6 +2574,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
        SET_DEVICE_OP(dev_ops, unmap_fmr);
 
        SET_OBJ_SIZE(dev_ops, ib_ah);
+       SET_OBJ_SIZE(dev_ops, ib_cq);
        SET_OBJ_SIZE(dev_ops, ib_pd);
        SET_OBJ_SIZE(dev_ops, ib_srq);
        SET_OBJ_SIZE(dev_ops, ib_ucontext);
index d117f21ce9fdeedb5ce5e5bfb62c51fd83661092..c0e2df128b3449948fd9d0d7a80c158b1cd1c887 100644 (file)
@@ -34,14 +34,18 @@ void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr)
 EXPORT_SYMBOL(ib_mr_pool_put);
 
 int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
-               enum ib_mr_type type, u32 max_num_sg)
+               enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg)
 {
        struct ib_mr *mr;
        unsigned long flags;
        int ret, i;
 
        for (i = 0; i < nr; i++) {
-               mr = ib_alloc_mr(qp->pd, type, max_num_sg);
+               if (type == IB_MR_TYPE_INTEGRITY)
+                       mr = ib_alloc_mr_integrity(qp->pd, max_num_sg,
+                                                  max_num_meta_sg);
+               else
+                       mr = ib_alloc_mr(qp->pd, type, max_num_sg);
                if (IS_ERR(mr)) {
                        ret = PTR_ERR(mr);
                        goto out;
index 69188cbbd99bd53ffbf4e49e738805dc240344fa..783e465e7c412988903088911fa655c4aa169346 100644 (file)
 #include "cma_priv.h"
 #include "restrack.h"
 
+/*
+ * Policy array; keep the entries sorted by netlink attribute name.
+ */
 static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
-       [RDMA_NLDEV_ATTR_DEV_INDEX]     = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_DEV_NAME]      = { .type = NLA_NUL_STRING,
-                                           .len = IB_DEVICE_NAME_MAX - 1},
-       [RDMA_NLDEV_ATTR_PORT_INDEX]    = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_FW_VERSION]    = { .type = NLA_NUL_STRING,
-                                           .len = IB_FW_VERSION_NAME_MAX - 1},
-       [RDMA_NLDEV_ATTR_NODE_GUID]     = { .type = NLA_U64 },
-       [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 },
-       [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 },
-       [RDMA_NLDEV_ATTR_LID]           = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_SM_LID]        = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_LMC]           = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_PORT_STATE]    = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_RES_SUMMARY]   = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY]     = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING,
-                                            .len = 16 },
-       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 },
-       [RDMA_NLDEV_ATTR_RES_QP]                = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_QP_ENTRY]          = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_LQPN]              = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_RQPN]              = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_RQ_PSN]            = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_SQ_PSN]            = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_RES_TYPE]              = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_RES_STATE]             = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_RES_PID]               = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_KERN_NAME]         = { .type = NLA_NUL_STRING,
-                                                   .len = TASK_COMM_LEN },
+       [RDMA_NLDEV_ATTR_CHARDEV]               = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_CHARDEV_ABI]           = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_CHARDEV_NAME]          = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_CHARDEV_TYPE]          = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE },
+       [RDMA_NLDEV_ATTR_DEV_DIM]               = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_DEV_INDEX]             = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_DEV_NAME]              = { .type = NLA_NUL_STRING,
+                                       .len = IB_DEVICE_NAME_MAX },
+       [RDMA_NLDEV_ATTR_DEV_NODE_TYPE]         = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_DEV_PROTOCOL]          = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_DRIVER]                = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_DRIVER_ENTRY]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE]     = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_DRIVER_STRING]         = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_DRIVER_S32]            = { .type = NLA_S32 },
+       [RDMA_NLDEV_ATTR_DRIVER_S64]            = { .type = NLA_S64 },
+       [RDMA_NLDEV_ATTR_DRIVER_U32]            = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_DRIVER_U64]            = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_FW_VERSION]            = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_LID]                   = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_LINK_TYPE]             = { .type = NLA_NUL_STRING,
+                                       .len = IFNAMSIZ },
+       [RDMA_NLDEV_ATTR_LMC]                   = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_NDEV_INDEX]            = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_NDEV_NAME]             = { .type = NLA_NUL_STRING,
+                                       .len = IFNAMSIZ },
+       [RDMA_NLDEV_ATTR_NODE_GUID]             = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_PORT_INDEX]            = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_PORT_PHYS_STATE]       = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_PORT_STATE]            = { .type = NLA_U8 },
        [RDMA_NLDEV_ATTR_RES_CM_ID]             = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_CM_IDN]            = { .type = NLA_U32 },
        [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY]       = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_PS]                = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_SRC_ADDR]  = {
-                       .len = sizeof(struct __kernel_sockaddr_storage) },
-       [RDMA_NLDEV_ATTR_RES_DST_ADDR]  = {
-                       .len = sizeof(struct __kernel_sockaddr_storage) },
        [RDMA_NLDEV_ATTR_RES_CQ]                = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_CQ_ENTRY]          = { .type = NLA_NESTED },
        [RDMA_NLDEV_ATTR_RES_CQE]               = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_USECNT]            = { .type = NLA_U64 },
-       [RDMA_NLDEV_ATTR_RES_POLL_CTX]          = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_RES_MR]                = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_MR_ENTRY]          = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_RKEY]              = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_LKEY]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_CQN]               = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_CQ_ENTRY]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_CTXN]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_DST_ADDR]          = {
+                       .len = sizeof(struct __kernel_sockaddr_storage) },
        [RDMA_NLDEV_ATTR_RES_IOVA]              = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_RES_KERN_NAME]         = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_RES_LKEY]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY]    = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_LQPN]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_MR]                = { .type = NLA_NESTED },
        [RDMA_NLDEV_ATTR_RES_MRLEN]             = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_RES_MRN]               = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_MR_ENTRY]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE]    = { .type = NLA_U8 },
        [RDMA_NLDEV_ATTR_RES_PD]                = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_PD_ENTRY]          = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY]    = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY] = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_NDEV_INDEX]            = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_NDEV_NAME]             = { .type = NLA_NUL_STRING,
-                                                   .len = IFNAMSIZ },
-       [RDMA_NLDEV_ATTR_DRIVER]                = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_DRIVER_ENTRY]          = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_DRIVER_STRING]         = { .type = NLA_NUL_STRING,
-                                   .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
-       [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE]     = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_DRIVER_S32]            = { .type = NLA_S32 },
-       [RDMA_NLDEV_ATTR_DRIVER_U32]            = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_DRIVER_S64]            = { .type = NLA_S64 },
-       [RDMA_NLDEV_ATTR_DRIVER_U64]            = { .type = NLA_U64 },
        [RDMA_NLDEV_ATTR_RES_PDN]               = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_CQN]               = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_MRN]               = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_CM_IDN]            = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_CTXN]              = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_LINK_TYPE]             = { .type = NLA_NUL_STRING,
-                                   .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
-       [RDMA_NLDEV_SYS_ATTR_NETNS_MODE]        = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_DEV_PROTOCOL]          = { .type = NLA_NUL_STRING,
-                                   .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+       [RDMA_NLDEV_ATTR_RES_PD_ENTRY]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_PID]               = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_POLL_CTX]          = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_RES_PS]                = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_QP]                = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_QP_ENTRY]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_RKEY]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_RQPN]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_RQ_PSN]            = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_SQ_PSN]            = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_SRC_ADDR]          = {
+                       .len = sizeof(struct __kernel_sockaddr_storage) },
+       [RDMA_NLDEV_ATTR_RES_STATE]             = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_RES_SUMMARY]           = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY]     = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_RES_TYPE]              = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_USECNT]            = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_SM_LID]                = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_SUBNET_PREFIX]         = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]   = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_STAT_MODE]             = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_STAT_RES]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_STAT_COUNTER]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY]    = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_STAT_COUNTER_ID]       = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]       = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY]  = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING },
+       [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID]        = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID]      = { .type = NLA_U32 },
        [RDMA_NLDEV_NET_NS_FD]                  = { .type = NLA_U32 },
+       [RDMA_NLDEV_SYS_ATTR_NETNS_MODE]        = { .type = NLA_U8 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -232,6 +253,8 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
                return -EMSGSIZE;
        if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type))
                return -EMSGSIZE;
+       if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim))
+               return -EMSGSIZE;
 
        /*
         * Link type is determined on first port and mlx4 device
@@ -532,6 +555,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
            nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
                goto err;
 
+       if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL)))
+               goto err;
+
        if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
                goto err;
        if (!rdma_is_kernel_res(res) &&
@@ -623,6 +649,152 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 err:   return -EMSGSIZE;
 }
 
+static int fill_stat_counter_mode(struct sk_buff *msg,
+                                 struct rdma_counter *counter)
+{
+       struct rdma_counter_mode *m = &counter->mode;
+
+       if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode))
+               return -EMSGSIZE;
+
+       if (m->mode == RDMA_COUNTER_MODE_AUTO)
+               if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) &&
+                   nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type))
+                       return -EMSGSIZE;
+
+       return 0;
+}
+
+static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn)
+{
+       struct nlattr *entry_attr;
+
+       entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
+       if (!entry_attr)
+               return -EMSGSIZE;
+
+       if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn))
+               goto err;
+
+       nla_nest_end(msg, entry_attr);
+       return 0;
+
+err:
+       nla_nest_cancel(msg, entry_attr);
+       return -EMSGSIZE;
+}
+
+static int fill_stat_counter_qps(struct sk_buff *msg,
+                                struct rdma_counter *counter)
+{
+       struct rdma_restrack_entry *res;
+       struct rdma_restrack_root *rt;
+       struct nlattr *table_attr;
+       struct ib_qp *qp = NULL;
+       unsigned long id = 0;
+       int ret = 0;
+
+       table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP);
+       if (!table_attr)
+               return -EMSGSIZE;
+
+       rt = &counter->device->res[RDMA_RESTRACK_QP];
+       xa_lock(&rt->xa);
+       xa_for_each(&rt->xa, id, res) {
+               if (!rdma_is_visible_in_pid_ns(res))
+                       continue;
+
+               qp = container_of(res, struct ib_qp, res);
+               if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+                       continue;
+
+               if (!qp->counter || (qp->counter->id != counter->id))
+                       continue;
+
+               ret = fill_stat_counter_qp_entry(msg, qp->qp_num);
+               if (ret)
+                       goto err;
+       }
+
+       xa_unlock(&rt->xa);
+       nla_nest_end(msg, table_attr);
+       return 0;
+
+err:
+       xa_unlock(&rt->xa);
+       nla_nest_cancel(msg, table_attr);
+       return ret;
+}
+
+static int fill_stat_hwcounter_entry(struct sk_buff *msg,
+                                    const char *name, u64 value)
+{
+       struct nlattr *entry_attr;
+
+       entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY);
+       if (!entry_attr)
+               return -EMSGSIZE;
+
+       if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,
+                          name))
+               goto err;
+       if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,
+                             value, RDMA_NLDEV_ATTR_PAD))
+               goto err;
+
+       nla_nest_end(msg, entry_attr);
+       return 0;
+
+err:
+       nla_nest_cancel(msg, entry_attr);
+       return -EMSGSIZE;
+}
+
+static int fill_stat_counter_hwcounters(struct sk_buff *msg,
+                                       struct rdma_counter *counter)
+{
+       struct rdma_hw_stats *st = counter->stats;
+       struct nlattr *table_attr;
+       int i;
+
+       table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+       if (!table_attr)
+               return -EMSGSIZE;
+
+       for (i = 0; i < st->num_counters; i++)
+               if (fill_stat_hwcounter_entry(msg, st->names[i], st->value[i]))
+                       goto err;
+
+       nla_nest_end(msg, table_attr);
+       return 0;
+
+err:
+       nla_nest_cancel(msg, table_attr);
+       return -EMSGSIZE;
+}
+
+static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin,
+                                 struct rdma_restrack_entry *res,
+                                 uint32_t port)
+{
+       struct rdma_counter *counter =
+               container_of(res, struct rdma_counter, res);
+
+       if (port && port != counter->port)
+               return 0;
+
+       /* Dump it even if the query failed */
+       rdma_counter_query_stats(counter);
+
+       if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) ||
+           fill_res_name_pid(msg, &counter->res) ||
+           fill_stat_counter_mode(msg, counter) ||
+           fill_stat_counter_qps(msg, counter) ||
+           fill_stat_counter_hwcounters(msg, counter))
+               return -EMSGSIZE;
+
+       return 0;
+}
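
For reference, the nesting the helpers above produce for one counter object
(informal sketch, attribute names only; the enclosing STAT_COUNTER_ENTRY nest
is opened by the common dumpit code via the fill_entries[] table below):

/*
 *  RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY
 *    RDMA_NLDEV_ATTR_PORT_INDEX, RDMA_NLDEV_ATTR_STAT_COUNTER_ID
 *    pid / kernel task name                         (fill_res_name_pid)
 *    RDMA_NLDEV_ATTR_STAT_MODE [+ RDMA_NLDEV_ATTR_RES_TYPE in auto mode]
 *    RDMA_NLDEV_ATTR_RES_QP
 *      RDMA_NLDEV_ATTR_RES_QP_ENTRY { RDMA_NLDEV_ATTR_RES_LQPN } ...
 *    RDMA_NLDEV_ATTR_STAT_HWCOUNTERS
 *      RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY { NAME, VALUE } ...
 */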
+
 static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
 {
@@ -704,6 +876,14 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                goto put_done;
        }
 
+       if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) {
+               u8 use_dim;
+
+               use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]);
+               err = ib_device_set_dim(device, use_dim);
+               goto done;
+       }
+
 done:
        ib_device_put(device);
 put_done:
@@ -990,19 +1170,15 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
                .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY,
                .id = RDMA_NLDEV_ATTR_RES_PDN,
        },
+       [RDMA_RESTRACK_COUNTER] = {
+               .fill_res_func = fill_res_counter_entry,
+               .nldev_cmd = RDMA_NLDEV_CMD_STAT_GET,
+               .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER,
+               .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,
+               .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID,
+       },
 };
 
-static bool is_visible_in_pid_ns(struct rdma_restrack_entry *res)
-{
-       /*
-        * 1. Kern resources should be visible in init name space only
-        * 2. Present only resources visible in the current namespace
-        */
-       if (rdma_is_kernel_res(res))
-               return task_active_pid_ns(current) == &init_pid_ns;
-       return task_active_pid_ns(current) == task_active_pid_ns(res->task);
-}
-
 static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                               struct netlink_ext_ack *extack,
                               enum rdma_restrack_type res_type)
@@ -1047,7 +1223,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                goto err;
        }
 
-       if (!is_visible_in_pid_ns(res)) {
+       if (!rdma_is_visible_in_pid_ns(res)) {
                ret = -ENOENT;
                goto err_get;
        }
@@ -1159,7 +1335,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
         * objects.
         */
        xa_for_each(&rt->xa, id, res) {
-               if (!is_visible_in_pid_ns(res))
+               if (!rdma_is_visible_in_pid_ns(res))
                        continue;
 
                if (idx < start || !rdma_restrack_get(res))
@@ -1237,6 +1413,7 @@ RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID);
 RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ);
 RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
 RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
+RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER);
 
 static LIST_HEAD(link_ops);
 static DECLARE_RWSEM(link_ops_rwsem);
@@ -1299,7 +1476,7 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
        nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
                    sizeof(ndev_name));
 
-       ndev = dev_get_by_name(&init_net, ndev_name);
+       ndev = dev_get_by_name(sock_net(skb->sk), ndev_name);
        if (!ndev)
                return -ENODEV;
 
@@ -1347,6 +1524,90 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
        return 0;
 }
 
+static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh,
+                            struct netlink_ext_ack *extack)
+{
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE];
+       struct ib_client_nl_info data = {};
+       struct ib_device *ibdev = NULL;
+       struct sk_buff *msg;
+       u32 index;
+       int err;
+
+       err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+                         extack);
+       if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE])
+               return -EINVAL;
+
+       nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE],
+                   sizeof(client_name));
+
+       if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) {
+               index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+               ibdev = ib_device_get_by_index(sock_net(skb->sk), index);
+               if (!ibdev)
+                       return -EINVAL;
+
+               if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+                       data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+                       if (!rdma_is_port_valid(ibdev, data.port)) {
+                               err = -EINVAL;
+                               goto out_put;
+                       }
+               } else {
+                       data.port = -1;
+               }
+       } else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+               return -EINVAL;
+       }
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!msg) {
+               err = -ENOMEM;
+               goto out_put;
+       }
+       nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                        RDMA_NLDEV_CMD_GET_CHARDEV),
+                       0, 0);
+
+       data.nl_msg = msg;
+       err = ib_get_client_nl_info(ibdev, client_name, &data);
+       if (err)
+               goto out_nlmsg;
+
+       err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV,
+                               huge_encode_dev(data.cdev->devt),
+                               RDMA_NLDEV_ATTR_PAD);
+       if (err)
+               goto out_data;
+       err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi,
+                               RDMA_NLDEV_ATTR_PAD);
+       if (err)
+               goto out_data;
+       if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME,
+                          dev_name(data.cdev))) {
+               err = -EMSGSIZE;
+               goto out_data;
+       }
+
+       nlmsg_end(msg, nlh);
+       put_device(data.cdev);
+       if (ibdev)
+               ib_device_put(ibdev);
+       return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+out_data:
+       put_device(data.cdev);
+out_nlmsg:
+       nlmsg_free(msg);
+out_put:
+       if (ibdev)
+               ib_device_put(ibdev);
+       return err;
+}
+
 static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
 {
@@ -1399,11 +1660,375 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
        return err;
 }
 
+static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+                              struct netlink_ext_ack *extack)
+{
+       u32 index, port, mode, mask = 0, qpn, cntn = 0;
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       struct ib_device *device;
+       struct sk_buff *msg;
+       int ret;
+
+       ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+                         nldev_policy, extack);
+       /* Currently only counter for QP is supported */
+       if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+           !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+           !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_MODE])
+               return -EINVAL;
+
+       if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+               return -EINVAL;
+
+       index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
+       if (!device)
+               return -EINVAL;
+
+       port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+       if (!rdma_is_port_valid(device, port)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!msg) {
+               ret = -ENOMEM;
+               goto err;
+       }
+       nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                        RDMA_NLDEV_CMD_STAT_SET),
+                       0, 0);
+
+       mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
+       if (mode == RDMA_COUNTER_MODE_AUTO) {
+               if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
+                       mask = nla_get_u32(
+                               tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+
+               ret = rdma_counter_set_auto_mode(device, port,
+                                                mask ? true : false, mask);
+               if (ret)
+                       goto err_msg;
+       } else {
+               if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) {
+                       ret = -EINVAL;
+                       goto err_msg;
+               }
+               qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+               if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) {
+                       cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+                       ret = rdma_counter_bind_qpn(device, port, qpn, cntn);
+               } else {
+                       ret = rdma_counter_bind_qpn_alloc(device, port,
+                                                         qpn, &cntn);
+               }
+               if (ret)
+                       goto err_msg;
+
+               if (fill_nldev_handle(msg, device) ||
+                   nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+                   nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+                   nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
+                       ret = -EMSGSIZE;
+                       goto err_fill;
+               }
+       }
+
+       nlmsg_end(msg, nlh);
+       ib_device_put(device);
+       return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_fill:
+       rdma_counter_unbind_qpn(device, port, qpn, cntn);
+err_msg:
+       nlmsg_free(msg);
+err:
+       ib_device_put(device);
+       return ret;
+}
+
+static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+                              struct netlink_ext_ack *extack)
+{
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       struct ib_device *device;
+       struct sk_buff *msg;
+       u32 index, port, qpn, cntn;
+       int ret;
+
+       ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+                         nldev_policy, extack);
+       if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+           !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] ||
+           !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] ||
+           !tb[RDMA_NLDEV_ATTR_RES_LQPN])
+               return -EINVAL;
+
+       if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+               return -EINVAL;
+
+       index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
+       if (!device)
+               return -EINVAL;
+
+       port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+       if (!rdma_is_port_valid(device, port)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!msg) {
+               ret = -ENOMEM;
+               goto err;
+       }
+       nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                        RDMA_NLDEV_CMD_STAT_SET),
+                       0, 0);
+
+       cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+       qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+       ret = rdma_counter_unbind_qpn(device, port, qpn, cntn);
+       if (ret)
+               goto err_unbind;
+
+       if (fill_nldev_handle(msg, device) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
+               ret = -EMSGSIZE;
+               goto err_fill;
+       }
+
+       nlmsg_end(msg, nlh);
+       ib_device_put(device);
+       return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_fill:
+       rdma_counter_bind_qpn(device, port, qpn, cntn);
+err_unbind:
+       nlmsg_free(msg);
+err:
+       ib_device_put(device);
+       return ret;
+}
+
+static int stat_get_doit_default_counter(struct sk_buff *skb,
+                                        struct nlmsghdr *nlh,
+                                        struct netlink_ext_ack *extack,
+                                        struct nlattr *tb[])
+{
+       struct rdma_hw_stats *stats;
+       struct nlattr *table_attr;
+       struct ib_device *device;
+       int ret, num_cnts, i;
+       struct sk_buff *msg;
+       u32 index, port;
+       u64 v;
+
+       if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+               return -EINVAL;
+
+       index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
+       if (!device)
+               return -EINVAL;
+
+       if (!device->ops.alloc_hw_stats || !device->ops.get_hw_stats) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+       if (!rdma_is_port_valid(device, port)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!msg) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                        RDMA_NLDEV_CMD_STAT_GET),
+                       0, 0);
+
+       if (fill_nldev_handle(msg, device) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) {
+               ret = -EMSGSIZE;
+               goto err_msg;
+       }
+
+       stats = device->port_data ? device->port_data[port].hw_stats : NULL;
+       if (stats == NULL) {
+               ret = -EINVAL;
+               goto err_msg;
+       }
+       mutex_lock(&stats->lock);
+
+       num_cnts = device->ops.get_hw_stats(device, stats, port, 0);
+       if (num_cnts < 0) {
+               ret = -EINVAL;
+               goto err_stats;
+       }
+
+       table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+       if (!table_attr) {
+               ret = -EMSGSIZE;
+               goto err_stats;
+       }
+       for (i = 0; i < num_cnts; i++) {
+               v = stats->value[i] +
+                       rdma_counter_get_hwstat_value(device, port, i);
+               if (fill_stat_hwcounter_entry(msg, stats->names[i], v)) {
+                       ret = -EMSGSIZE;
+                       goto err_table;
+               }
+       }
+       nla_nest_end(msg, table_attr);
+
+       mutex_unlock(&stats->lock);
+       nlmsg_end(msg, nlh);
+       ib_device_put(device);
+       return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_table:
+       nla_nest_cancel(msg, table_attr);
+err_stats:
+       mutex_unlock(&stats->lock);
+err_msg:
+       nlmsg_free(msg);
+err:
+       ib_device_put(device);
+       return ret;
+}
+
+static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
+                           struct netlink_ext_ack *extack, struct nlattr *tb[])
+{
+       enum rdma_nl_counter_mode mode;
+       enum rdma_nl_counter_mask mask;
+       struct ib_device *device;
+       struct sk_buff *msg;
+       u32 index, port;
+       int ret;
+
+       if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID])
+               return nldev_res_get_counter_doit(skb, nlh, extack);
+
+       if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] ||
+           !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+               return -EINVAL;
+
+       index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
+       if (!device)
+               return -EINVAL;
+
+       port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+       if (!rdma_is_port_valid(device, port)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!msg) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                        RDMA_NLDEV_CMD_STAT_GET),
+                       0, 0);
+
+       ret = rdma_counter_get_mode(device, port, &mode, &mask);
+       if (ret)
+               goto err_msg;
+
+       if (fill_nldev_handle(msg, device) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) {
+               ret = -EMSGSIZE;
+               goto err_msg;
+       }
+
+       if ((mode == RDMA_COUNTER_MODE_AUTO) &&
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) {
+               ret = -EMSGSIZE;
+               goto err_msg;
+       }
+
+       nlmsg_end(msg, nlh);
+       ib_device_put(device);
+       return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_msg:
+       nlmsg_free(msg);
+err:
+       ib_device_put(device);
+       return ret;
+}
+
+static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+                              struct netlink_ext_ack *extack)
+{
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       int ret;
+
+       ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+                         nldev_policy, extack);
+       if (ret)
+               return -EINVAL;
+
+       if (!tb[RDMA_NLDEV_ATTR_STAT_RES])
+               return stat_get_doit_default_counter(skb, nlh, extack, tb);
+
+       switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+       case RDMA_NLDEV_ATTR_RES_QP:
+               ret = stat_get_doit_qp(skb, nlh, extack, tb);
+               break;
+
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       return ret;
+}
+
+static int nldev_stat_get_dumpit(struct sk_buff *skb,
+                                struct netlink_callback *cb)
+{
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       int ret;
+
+       ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+                         nldev_policy, NULL);
+       if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
+               return -EINVAL;
+
+       switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+       case RDMA_NLDEV_ATTR_RES_QP:
+               ret = nldev_res_get_counter_dumpit(skb, cb);
+               break;
+
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       return ret;
+}
+
 static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
        [RDMA_NLDEV_CMD_GET] = {
                .doit = nldev_get_doit,
                .dump = nldev_get_dumpit,
        },
+       [RDMA_NLDEV_CMD_GET_CHARDEV] = {
+               .doit = nldev_get_chardev,
+       },
        [RDMA_NLDEV_CMD_SET] = {
                .doit = nldev_set_doit,
                .flags = RDMA_NL_ADMIN_PERM,
@@ -1449,6 +2074,17 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
        },
        [RDMA_NLDEV_CMD_SYS_SET] = {
                .doit = nldev_set_sys_set_doit,
+       },
+       [RDMA_NLDEV_CMD_STAT_SET] = {
+               .doit = nldev_stat_set_doit,
+               .flags = RDMA_NL_ADMIN_PERM,
+       },
+       [RDMA_NLDEV_CMD_STAT_GET] = {
+               .doit = nldev_stat_get_doit,
+               .dump = nldev_stat_get_dumpit,
+       },
+       [RDMA_NLDEV_CMD_STAT_DEL] = {
+               .doit = nldev_stat_del_doit,
                .flags = RDMA_NL_ADMIN_PERM,
        },
 };
index 3b5ff2f7b5f8759416f56b69758c161c18afbfc2..bddff426ee0f0d37dfb32a9e03f97dc9cde7a9ad 100644 (file)
@@ -6,6 +6,7 @@
 #include <rdma/rdma_cm.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/restrack.h>
+#include <rdma/rdma_counter.h>
 #include <linux/mutex.h>
 #include <linux/sched/task.h>
 #include <linux/pid_namespace.h>
@@ -45,6 +46,7 @@ static const char *type2str(enum rdma_restrack_type type)
                [RDMA_RESTRACK_CM_ID] = "CM_ID",
                [RDMA_RESTRACK_MR] = "MR",
                [RDMA_RESTRACK_CTX] = "CTX",
+               [RDMA_RESTRACK_COUNTER] = "COUNTER",
        };
 
        return names[type];
@@ -169,6 +171,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
                return container_of(res, struct ib_mr, res)->device;
        case RDMA_RESTRACK_CTX:
                return container_of(res, struct ib_ucontext, res)->device;
+       case RDMA_RESTRACK_COUNTER:
+               return container_of(res, struct rdma_counter, res)->device;
        default:
                WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
                return NULL;
@@ -190,6 +194,20 @@ void rdma_restrack_set_task(struct rdma_restrack_entry *res,
 }
 EXPORT_SYMBOL(rdma_restrack_set_task);
 
+/**
+ * rdma_restrack_attach_task() - attach the task onto this resource
+ * @res:  resource entry
+ * @task: the task to attach; must not be NULL
+ */
+void rdma_restrack_attach_task(struct rdma_restrack_entry *res,
+                              struct task_struct *task)
+{
+       if (res->task)
+               put_task_struct(res->task);
+       get_task_struct(task);
+       res->task = task;
+}
+
 static void rdma_restrack_add(struct rdma_restrack_entry *res)
 {
        struct ib_device *dev = res_to_dev(res);
@@ -203,15 +221,22 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res)
 
        kref_init(&res->kref);
        init_completion(&res->comp);
-       if (res->type != RDMA_RESTRACK_QP)
-               ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b,
-                               &rt->next_id, GFP_KERNEL);
-       else {
+       if (res->type == RDMA_RESTRACK_QP) {
                /* Special case to ensure that LQPN points to the right QP */
                struct ib_qp *qp = container_of(res, struct ib_qp, res);
 
                ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL);
                res->id = ret ? 0 : qp->qp_num;
+       } else if (res->type == RDMA_RESTRACK_COUNTER) {
+               /* Special case to ensure that cntn points to the right counter */
+               struct rdma_counter *counter;
+
+               counter = container_of(res, struct rdma_counter, res);
+               ret = xa_insert(&rt->xa, counter->id, res, GFP_KERNEL);
+               res->id = ret ? 0 : counter->id;
+       } else {
+               ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b,
+                                     &rt->next_id, GFP_KERNEL);
        }
 
        if (!ret)
@@ -237,7 +262,8 @@ EXPORT_SYMBOL(rdma_restrack_kadd);
  */
 void rdma_restrack_uadd(struct rdma_restrack_entry *res)
 {
-       if (res->type != RDMA_RESTRACK_CM_ID)
+       if ((res->type != RDMA_RESTRACK_CM_ID) &&
+           (res->type != RDMA_RESTRACK_COUNTER))
                res->task = NULL;
 
        if (!res->task)
@@ -323,3 +349,16 @@ out:
        }
 }
 EXPORT_SYMBOL(rdma_restrack_del);
+
+bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res)
+{
+       /*
+        * 1. Kernel resources are visible in the init
+        *    namespace only
+        * 2. Present only resources visible in the current
+        *    namespace
+        */
+       if (rdma_is_kernel_res(res))
+               return task_active_pid_ns(current) == &init_pid_ns;
+       return task_active_pid_ns(current) == task_active_pid_ns(res->task);
+}
diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h
index 09a1fbdf578ed1a87d61ecb9aeca9b314b15424a..7bd177cc0a6179c635532ed7f718ab432d07a069 100644 (file)
@@ -25,4 +25,7 @@ struct rdma_restrack_root {
 
 int rdma_restrack_init(struct ib_device *dev);
 void rdma_restrack_clean(struct ib_device *dev);
+void rdma_restrack_attach_task(struct rdma_restrack_entry *res,
+                              struct task_struct *task);
+bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res);
 #endif /* _RDMA_CORE_RESTRACK_H_ */
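
A sketch, not taken from the patch, of the call pattern the two declarations above are meant to support in the counter code: the owning task is attached before the entry is registered, and dump paths gate entries with rdma_is_visible_in_pid_ns(). Only the restrack calls are real API here; the surrounding functions are hypothetical.

#include <linux/sched.h>
#include <rdma/restrack.h>
#include <rdma/rdma_counter.h>
#include "restrack.h"

static void example_bind_counter(struct rdma_counter *counter)
{
	/*
	 * counter->id is assumed to be assigned already; the xa_insert()
	 * in rdma_restrack_add() keys the entry on it.
	 */
	counter->res.type = RDMA_RESTRACK_COUNTER;
	/* takes a reference on current and drops any previously attached task */
	rdma_restrack_attach_task(&counter->res, current);
	rdma_restrack_uadd(&counter->res);	/* user-space owned entry */
}

static bool example_may_dump(struct rdma_restrack_entry *res)
{
	/* kernel-owned entries are only presented in the init pid namespace */
	return rdma_is_visible_in_pid_ns(res);
}
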
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 32ca8429eaaea17448b456ada31d59b6cc467b7d..dce06108c8c3fe39d9bd2d28ae9d88dd6cab82c7 100644 (file)
@@ -51,10 +51,34 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
        return false;
 }
 
-static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
+static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev,
+                                          bool pi_support)
 {
+       u32 max_pages;
+
+       if (pi_support)
+               max_pages = dev->attrs.max_pi_fast_reg_page_list_len;
+       else
+               max_pages = dev->attrs.max_fast_reg_page_list_len;
+
        /* arbitrary limit to avoid allocating gigantic resources */
-       return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
+       return min_t(u32, max_pages, 256);
+}
+
+static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg)
+{
+       int count = 0;
+
+       if (reg->mr->need_inval) {
+               reg->inv_wr.opcode = IB_WR_LOCAL_INV;
+               reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
+               reg->inv_wr.next = &reg->reg_wr.wr;
+               count++;
+       } else {
+               reg->inv_wr.next = NULL;
+       }
+
+       return count;
 }
 
 /* Caller must have zero-initialized *reg. */
@@ -62,7 +86,8 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
                struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
                u32 sg_cnt, u32 offset)
 {
-       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+                                                   qp->integrity_en);
        u32 nents = min(sg_cnt, pages_per_mr);
        int count = 0, ret;
 
@@ -70,14 +95,7 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
        if (!reg->mr)
                return -EAGAIN;
 
-       if (reg->mr->need_inval) {
-               reg->inv_wr.opcode = IB_WR_LOCAL_INV;
-               reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
-               reg->inv_wr.next = &reg->reg_wr.wr;
-               count++;
-       } else {
-               reg->inv_wr.next = NULL;
-       }
+       count += rdma_rw_inv_key(reg);
 
        ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
        if (ret < 0 || ret < nents) {
@@ -102,7 +120,8 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
        struct rdma_rw_reg_ctx *prev = NULL;
-       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+                                                   qp->integrity_en);
        int i, j, ret = 0, count = 0;
 
        ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr;
@@ -343,13 +362,14 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
        struct ib_device *dev = qp->pd->device;
-       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+                                                   qp->integrity_en);
        struct ib_rdma_wr *rdma_wr;
-       struct ib_send_wr *prev_wr = NULL;
        int count = 0, ret;
 
        if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
-               pr_err("SG count too large\n");
+               pr_err("SG count too large: sg_cnt=%d, prot_sg_cnt=%d, pages_per_mr=%d\n",
+                      sg_cnt, prot_sg_cnt, pages_per_mr);
                return -EINVAL;
        }
 
@@ -358,75 +378,58 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                return -ENOMEM;
        sg_cnt = ret;
 
-       ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
-       if (!ret) {
-               ret = -ENOMEM;
-               goto out_unmap_sg;
+       if (prot_sg_cnt) {
+               ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
+               if (!ret) {
+                       ret = -ENOMEM;
+                       goto out_unmap_sg;
+               }
+               prot_sg_cnt = ret;
        }
-       prot_sg_cnt = ret;
 
        ctx->type = RDMA_RW_SIG_MR;
        ctx->nr_ops = 1;
-       ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL);
-       if (!ctx->sig) {
+       ctx->reg = kcalloc(1, sizeof(*ctx->reg), GFP_KERNEL);
+       if (!ctx->reg) {
                ret = -ENOMEM;
                goto out_unmap_prot_sg;
        }
 
-       ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0);
-       if (ret < 0)
-               goto out_free_ctx;
-       count += ret;
-       prev_wr = &ctx->sig->data.reg_wr.wr;
-
-       ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot,
-                                 prot_sg, prot_sg_cnt, 0);
-       if (ret < 0)
-               goto out_destroy_data_mr;
-       count += ret;
-
-       if (ctx->sig->prot.inv_wr.next)
-               prev_wr->next = &ctx->sig->prot.inv_wr;
-       else
-               prev_wr->next = &ctx->sig->prot.reg_wr.wr;
-       prev_wr = &ctx->sig->prot.reg_wr.wr;
-
-       ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs);
-       if (!ctx->sig->sig_mr) {
+       ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs);
+       if (!ctx->reg->mr) {
                ret = -EAGAIN;
-               goto out_destroy_prot_mr;
+               goto out_free_ctx;
        }
 
-       if (ctx->sig->sig_mr->need_inval) {
-               memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr));
+       count += rdma_rw_inv_key(ctx->reg);
 
-               ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV;
-               ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey;
+       memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));
 
-               prev_wr->next = &ctx->sig->sig_inv_wr;
-               prev_wr = &ctx->sig->sig_inv_wr;
+       ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sg_cnt, NULL, prot_sg,
+                             prot_sg_cnt, NULL, SZ_4K);
+       if (unlikely(ret)) {
+               pr_err("failed to map PI sg (%d)\n", sg_cnt + prot_sg_cnt);
+               goto out_destroy_sig_mr;
        }
 
-       ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
-       ctx->sig->sig_wr.wr.wr_cqe = NULL;
-       ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge;
-       ctx->sig->sig_wr.wr.num_sge = 1;
-       ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
-       ctx->sig->sig_wr.sig_attrs = sig_attrs;
-       ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr;
-       if (prot_sg_cnt)
-               ctx->sig->sig_wr.prot = &ctx->sig->prot.sge;
-       prev_wr->next = &ctx->sig->sig_wr.wr;
-       prev_wr = &ctx->sig->sig_wr.wr;
+       ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
+       ctx->reg->reg_wr.wr.wr_cqe = NULL;
+       ctx->reg->reg_wr.wr.num_sge = 0;
+       ctx->reg->reg_wr.wr.send_flags = 0;
+       ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+       if (rdma_protocol_iwarp(qp->device, port_num))
+               ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+       ctx->reg->reg_wr.mr = ctx->reg->mr;
+       ctx->reg->reg_wr.key = ctx->reg->mr->lkey;
        count++;
 
-       ctx->sig->sig_sge.addr = 0;
-       ctx->sig->sig_sge.length = ctx->sig->data.sge.length;
-       if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE)
-               ctx->sig->sig_sge.length += ctx->sig->prot.sge.length;
+       ctx->reg->sge.addr = ctx->reg->mr->iova;
+       ctx->reg->sge.length = ctx->reg->mr->length;
+       if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE)
+               ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length;
 
-       rdma_wr = &ctx->sig->data.wr;
-       rdma_wr->wr.sg_list = &ctx->sig->sig_sge;
+       rdma_wr = &ctx->reg->wr;
+       rdma_wr->wr.sg_list = &ctx->reg->sge;
        rdma_wr->wr.num_sge = 1;
        rdma_wr->remote_addr = remote_addr;
        rdma_wr->rkey = rkey;
@@ -434,21 +437,18 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
        else
                rdma_wr->wr.opcode = IB_WR_RDMA_READ;
-       prev_wr->next = &rdma_wr->wr;
-       prev_wr = &rdma_wr->wr;
+       ctx->reg->reg_wr.wr.next = &rdma_wr->wr;
        count++;
 
        return count;
 
-out_destroy_prot_mr:
-       if (prot_sg_cnt)
-               ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
-out_destroy_data_mr:
-       ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+out_destroy_sig_mr:
+       ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
 out_free_ctx:
-       kfree(ctx->sig);
+       kfree(ctx->reg);
 out_unmap_prot_sg:
-       ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+       if (prot_sg_cnt)
+               ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
 out_unmap_sg:
        ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
        return ret;
@@ -491,22 +491,8 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 
        switch (ctx->type) {
        case RDMA_RW_SIG_MR:
-               rdma_rw_update_lkey(&ctx->sig->data, true);
-               if (ctx->sig->prot.mr)
-                       rdma_rw_update_lkey(&ctx->sig->prot, true);
-       
-               ctx->sig->sig_mr->need_inval = true;
-               ib_update_fast_reg_key(ctx->sig->sig_mr,
-                       ib_inc_rkey(ctx->sig->sig_mr->lkey));
-               ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey;
-
-               if (ctx->sig->data.inv_wr.next)
-                       first_wr = &ctx->sig->data.inv_wr;
-               else
-                       first_wr = &ctx->sig->data.reg_wr.wr;
-               last_wr = &ctx->sig->data.wr.wr;
-               break;
+               /* fallthrough */
        case RDMA_RW_MR:
                for (i = 0; i < ctx->nr_ops; i++) {
                        rdma_rw_update_lkey(&ctx->reg[i],
                                ctx->reg[i].wr.wr.opcode !=
@@ -605,7 +591,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_destroy);
 
 /**
  * rdma_rw_ctx_destroy_signature - release all resources allocated by
- *     rdma_rw_ctx_init_signature
+ *     rdma_rw_ctx_signature_init
  * @ctx:       context to release
  * @qp:                queue pair to operate on
  * @port_num:  port num to which the connection is bound
@@ -623,16 +609,12 @@ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
        if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
                return;
 
-       ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
-       ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+       ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
+       kfree(ctx->reg);
 
-       if (ctx->sig->prot.mr) {
-               ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+       ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+       if (prot_sg_cnt)
                ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
-       }
-
-       ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr);
-       kfree(ctx->sig);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
 
@@ -653,7 +635,7 @@ unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num,
        unsigned int mr_pages;
 
        if (rdma_rw_can_use_mr(device, port_num))
-               mr_pages = rdma_rw_fr_page_list_len(device);
+               mr_pages = rdma_rw_fr_page_list_len(device, false);
        else
                mr_pages = device->attrs.max_sge_rd;
        return DIV_ROUND_UP(maxpages, mr_pages);
@@ -679,9 +661,8 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
         * we'll need two additional MRs for the registrations and the
         * invalidation.
         */
-       if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
-               factor += 6;    /* (inv + reg) * (data + prot + sig) */
-       else if (rdma_rw_can_use_mr(dev, attr->port_num))
+       if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN ||
+           rdma_rw_can_use_mr(dev, attr->port_num))
                factor += 2;    /* inv + reg */
 
        attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
@@ -697,20 +678,22 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
 int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 {
        struct ib_device *dev = qp->pd->device;
-       u32 nr_mrs = 0, nr_sig_mrs = 0;
+       u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0;
        int ret = 0;
 
-       if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) {
+       if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) {
                nr_sig_mrs = attr->cap.max_rdma_ctxs;
-               nr_mrs = attr->cap.max_rdma_ctxs * 2;
+               nr_mrs = attr->cap.max_rdma_ctxs;
+               max_num_sg = rdma_rw_fr_page_list_len(dev, true);
        } else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
                nr_mrs = attr->cap.max_rdma_ctxs;
+               max_num_sg = rdma_rw_fr_page_list_len(dev, false);
        }
 
        if (nr_mrs) {
                ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
                                IB_MR_TYPE_MEM_REG,
-                               rdma_rw_fr_page_list_len(dev));
+                               max_num_sg, 0);
                if (ret) {
                        pr_err("%s: failed to allocated %d MRs\n",
                                __func__, nr_mrs);
@@ -720,10 +703,10 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 
        if (nr_sig_mrs) {
                ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
-                               IB_MR_TYPE_SIGNATURE, 2);
+                               IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg);
                if (ret) {
                        pr_err("%s: failed to allocated %d SIG MRs\n",
-                               __func__, nr_mrs);
+                               __func__, nr_sig_mrs);
                        goto out_free_rdma_mrs;
                }
        }
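
For orientation, a sketch of the ULP side that the reworked rw path expects, assuming the integrity flag names this series renames the old signature flags to (IB_DEVICE_INTEGRITY_HANDOVER, IB_QP_CREATE_INTEGRITY_EN); the values and error handling are illustrative, not a definitive recipe.

#include <rdma/ib_verbs.h>

static int example_create_pi_qp(struct ib_pd *pd, struct ib_cq *cq)
{
	struct ib_qp_init_attr attr = {};
	struct ib_qp *qp;

	if (!(pd->device->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER))
		return -EOPNOTSUPP;

	attr.qp_type = IB_QPT_RC;
	attr.send_cq = cq;
	attr.recv_cq = cq;
	attr.port_num = 1;			/* example value */
	attr.cap.max_rdma_ctxs = 16;		/* example value */
	attr.create_flags = IB_QP_CREATE_INTEGRITY_EN;

	qp = ib_create_qp(pd, &attr);
	if (IS_ERR(qp))
		return PTR_ERR(qp);

	/*
	 * ... the ULP would now use rdma_rw_ctx_signature_init() and
	 * rdma_rw_ctx_wrs() to build and post protected transfers ...
	 */

	ib_destroy_qp(qp);
	return 0;
}
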
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index c78d0c9646ae5d990ccf72a4cf9b531b4ccf57de..b477295a96c2a6bb2ee47cd4950060f08b92cd1c 100644 (file)
@@ -43,6 +43,7 @@
 #include <rdma/ib_mad.h>
 #include <rdma/ib_pma.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
 
 struct ib_port;
 
@@ -800,9 +801,12 @@ static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
        return 0;
 }
 
-static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf)
+static ssize_t print_hw_stat(struct ib_device *dev, int port_num,
+                            struct rdma_hw_stats *stats, int index, char *buf)
 {
-       return sprintf(buf, "%llu\n", stats->value[index]);
+       u64 v = rdma_counter_get_hwstat_value(dev, port_num, index);
+
+       return sprintf(buf, "%llu\n", stats->value[index] + v);
 }
 
 static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
@@ -828,7 +832,7 @@ static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
        ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index);
        if (ret)
                goto unlock;
-       ret = print_hw_stat(stats, hsa->index, buf);
+       ret = print_hw_stat(dev, hsa->port_num, stats, hsa->index, buf);
 unlock:
        mutex_unlock(&stats->lock);
 
@@ -999,6 +1003,8 @@ static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
                        goto err;
                port->hw_stats_ag = hsag;
                port->hw_stats = stats;
+               if (device->port_data)
+                       device->port_data[port_num].hw_stats = stats;
        } else {
                struct kobject *kobj = &device->dev.kobj;
                ret = sysfs_create_group(kobj, hsag);
@@ -1289,6 +1295,8 @@ const struct attribute_group ib_dev_attr_group = {
 
 void ib_free_port_attrs(struct ib_core_device *coredev)
 {
+       struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
+       bool is_full_dev = &device->coredev == coredev;
        struct kobject *p, *t;
 
        list_for_each_entry_safe(p, t, &coredev->port_list, entry) {
@@ -1298,6 +1306,8 @@ void ib_free_port_attrs(struct ib_core_device *coredev)
                if (port->hw_stats_ag)
                        free_hsag(&port->kobj, port->hw_stats_ag);
                kfree(port->hw_stats);
+               if (device->port_data && is_full_dev)
+                       device->port_data[port->port_num].hw_stats = NULL;
 
                if (port->pma_table)
                        sysfs_remove_group(p, port->pma_table);
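
A sketch of the driver side that feeds the sysfs path above, under the assumption that the alloc_hw_stats/get_hw_stats ops keep their existing prototypes; the counter names and values are made up. The value a reader sees in sysfs is now this driver-reported value plus whatever rdma_counter_get_hwstat_value() reports for counters bound on that port.

#include <linux/kernel.h>
#include <rdma/ib_verbs.h>

static const char * const demo_counter_names[] = {
	"demo_rx_pkts",
	"demo_tx_pkts",
};

static struct rdma_hw_stats *demo_alloc_hw_stats(struct ib_device *ibdev,
						 u8 port_num)
{
	return rdma_alloc_hw_stats_struct(demo_counter_names,
					  ARRAY_SIZE(demo_counter_names),
					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
}

static int demo_get_hw_stats(struct ib_device *ibdev,
			     struct rdma_hw_stats *stats,
			     u8 port_num, int index)
{
	/* a real driver reads hardware or firmware counters here */
	stats->value[0] = 0;
	stats->value[1] = 0;

	return ARRAY_SIZE(demo_counter_names);	/* number of counters updated */
}
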
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
deleted file mode 100644 (file)
index 8e7da2d..0000000
--- a/drivers/infiniband/core/ucm.c
+++ /dev/null
@@ -1,1350 +0,0 @@
-/*
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *     copyright notice, this list of conditions and the following
- *     disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *     copyright notice, this list of conditions and the following
- *     disclaimer in the documentation and/or other materials
- *     provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/completion.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/poll.h>
-#include <linux/sched.h>
-#include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/cdev.h>
-#include <linux/xarray.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-
-#include <linux/nospec.h>
-
-#include <linux/uaccess.h>
-
-#include <rdma/ib.h>
-#include <rdma/ib_cm.h>
-#include <rdma/ib_user_cm.h>
-#include <rdma/ib_marshall.h>
-
-#include "core_priv.h"
-
-MODULE_AUTHOR("Libor Michalek");
-MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
-MODULE_LICENSE("Dual BSD/GPL");
-
-struct ib_ucm_device {
-       int                     devnum;
-       struct cdev             cdev;
-       struct device           dev;
-       struct ib_device        *ib_dev;
-};
-
-struct ib_ucm_file {
-       struct mutex file_mutex;
-       struct file *filp;
-       struct ib_ucm_device *device;
-
-       struct list_head  ctxs;
-       struct list_head  events;
-       wait_queue_head_t poll_wait;
-};
-
-struct ib_ucm_context {
-       int                 id;
-       struct completion   comp;
-       atomic_t            ref;
-       int                 events_reported;
-
-       struct ib_ucm_file *file;
-       struct ib_cm_id    *cm_id;
-       __u64              uid;
-
-       struct list_head    events;    /* list of pending events. */
-       struct list_head    file_list; /* member in file ctx list */
-};
-
-struct ib_ucm_event {
-       struct ib_ucm_context *ctx;
-       struct list_head file_list; /* member in file event list */
-       struct list_head ctx_list;  /* member in ctx event list */
-
-       struct ib_cm_id *cm_id;
-       struct ib_ucm_event_resp resp;
-       void *data;
-       void *info;
-       int data_len;
-       int info_len;
-};
-
-enum {
-       IB_UCM_MAJOR = 231,
-       IB_UCM_BASE_MINOR = 224,
-       IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS,
-       IB_UCM_NUM_FIXED_MINOR = 32,
-       IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR,
-};
-
-#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
-static dev_t dynamic_ucm_dev;
-
-static void ib_ucm_add_one(struct ib_device *device);
-static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
-
-static struct ib_client ucm_client = {
-       .name   = "ucm",
-       .add    = ib_ucm_add_one,
-       .remove = ib_ucm_remove_one
-};
-
-static DEFINE_XARRAY_ALLOC(ctx_id_table);
-static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
-
-static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
-{
-       struct ib_ucm_context *ctx;
-
-       xa_lock(&ctx_id_table);
-       ctx = xa_load(&ctx_id_table, id);
-       if (!ctx)
-               ctx = ERR_PTR(-ENOENT);
-       else if (ctx->file != file)
-               ctx = ERR_PTR(-EINVAL);
-       else
-               atomic_inc(&ctx->ref);
-       xa_unlock(&ctx_id_table);
-
-       return ctx;
-}
-
-static void ib_ucm_ctx_put(struct ib_ucm_context *ctx)
-{
-       if (atomic_dec_and_test(&ctx->ref))
-               complete(&ctx->comp);
-}
-
-static inline int ib_ucm_new_cm_id(int event)
-{
-       return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED;
-}
-
-static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
-{
-       struct ib_ucm_event *uevent;
-
-       mutex_lock(&ctx->file->file_mutex);
-       list_del(&ctx->file_list);
-       while (!list_empty(&ctx->events)) {
-
-               uevent = list_entry(ctx->events.next,
-                                   struct ib_ucm_event, ctx_list);
-               list_del(&uevent->file_list);
-               list_del(&uevent->ctx_list);
-               mutex_unlock(&ctx->file->file_mutex);
-
-               /* clear incoming connections. */
-               if (ib_ucm_new_cm_id(uevent->resp.event))
-                       ib_destroy_cm_id(uevent->cm_id);
-
-               kfree(uevent);
-               mutex_lock(&ctx->file->file_mutex);
-       }
-       mutex_unlock(&ctx->file->file_mutex);
-}
-
-static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
-{
-       struct ib_ucm_context *ctx;
-
-       ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
-       if (!ctx)
-               return NULL;
-
-       atomic_set(&ctx->ref, 1);
-       init_completion(&ctx->comp);
-       ctx->file = file;
-       INIT_LIST_HEAD(&ctx->events);
-
-       if (xa_alloc(&ctx_id_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL))
-               goto error;
-
-       list_add_tail(&ctx->file_list, &file->ctxs);
-       return ctx;
-
-error:
-       kfree(ctx);
-       return NULL;
-}
-
-static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq,
-                                const struct ib_cm_req_event_param *kreq)
-{
-       ureq->remote_ca_guid             = kreq->remote_ca_guid;
-       ureq->remote_qkey                = kreq->remote_qkey;
-       ureq->remote_qpn                 = kreq->remote_qpn;
-       ureq->qp_type                    = kreq->qp_type;
-       ureq->starting_psn               = kreq->starting_psn;
-       ureq->responder_resources        = kreq->responder_resources;
-       ureq->initiator_depth            = kreq->initiator_depth;
-       ureq->local_cm_response_timeout  = kreq->local_cm_response_timeout;
-       ureq->flow_control               = kreq->flow_control;
-       ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout;
-       ureq->retry_count                = kreq->retry_count;
-       ureq->rnr_retry_count            = kreq->rnr_retry_count;
-       ureq->srq                        = kreq->srq;
-       ureq->port                       = kreq->port;
-
-       ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path);
-       if (kreq->alternate_path)
-               ib_copy_path_rec_to_user(&ureq->alternate_path,
-                                        kreq->alternate_path);
-}
-
-static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep,
-                                const struct ib_cm_rep_event_param *krep)
-{
-       urep->remote_ca_guid      = krep->remote_ca_guid;
-       urep->remote_qkey         = krep->remote_qkey;
-       urep->remote_qpn          = krep->remote_qpn;
-       urep->starting_psn        = krep->starting_psn;
-       urep->responder_resources = krep->responder_resources;
-       urep->initiator_depth     = krep->initiator_depth;
-       urep->target_ack_delay    = krep->target_ack_delay;
-       urep->failover_accepted   = krep->failover_accepted;
-       urep->flow_control        = krep->flow_control;
-       urep->rnr_retry_count     = krep->rnr_retry_count;
-       urep->srq                 = krep->srq;
-}
-
-static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep,
-                                     const struct ib_cm_sidr_rep_event_param *krep)
-{
-       urep->status = krep->status;
-       urep->qkey   = krep->qkey;
-       urep->qpn    = krep->qpn;
-};
-
-static int ib_ucm_event_process(const struct ib_cm_event *evt,
-                               struct ib_ucm_event *uvt)
-{
-       void *info = NULL;
-
-       switch (evt->event) {
-       case IB_CM_REQ_RECEIVED:
-               ib_ucm_event_req_get(&uvt->resp.u.req_resp,
-                                    &evt->param.req_rcvd);
-               uvt->data_len      = IB_CM_REQ_PRIVATE_DATA_SIZE;
-               uvt->resp.present  = IB_UCM_PRES_PRIMARY;
-               uvt->resp.present |= (evt->param.req_rcvd.alternate_path ?
-                                     IB_UCM_PRES_ALTERNATE : 0);
-               break;
-       case IB_CM_REP_RECEIVED:
-               ib_ucm_event_rep_get(&uvt->resp.u.rep_resp,
-                                    &evt->param.rep_rcvd);
-               uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
-               break;
-       case IB_CM_RTU_RECEIVED:
-               uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE;
-               uvt->resp.u.send_status = evt->param.send_status;
-               break;
-       case IB_CM_DREQ_RECEIVED:
-               uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE;
-               uvt->resp.u.send_status = evt->param.send_status;
-               break;
-       case IB_CM_DREP_RECEIVED:
-               uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE;
-               uvt->resp.u.send_status = evt->param.send_status;
-               break;
-       case IB_CM_MRA_RECEIVED:
-               uvt->resp.u.mra_resp.timeout =
-                                       evt->param.mra_rcvd.service_timeout;
-               uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE;
-               break;
-       case IB_CM_REJ_RECEIVED:
-               uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason;
-               uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
-               uvt->info_len = evt->param.rej_rcvd.ari_length;
-               info          = evt->param.rej_rcvd.ari;
-               break;
-       case IB_CM_LAP_RECEIVED:
-               ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path,
-                                        evt->param.lap_rcvd.alternate_path);
-               uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE;
-               uvt->resp.present = IB_UCM_PRES_ALTERNATE;
-               break;
-       case IB_CM_APR_RECEIVED:
-               uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status;
-               uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE;
-               uvt->info_len = evt->param.apr_rcvd.info_len;
-               info          = evt->param.apr_rcvd.apr_info;
-               break;
-       case IB_CM_SIDR_REQ_RECEIVED:
-               uvt->resp.u.sidr_req_resp.pkey =
-                                       evt->param.sidr_req_rcvd.pkey;
-               uvt->resp.u.sidr_req_resp.port =
-                                       evt->param.sidr_req_rcvd.port;
-               uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
-               break;
-       case IB_CM_SIDR_REP_RECEIVED:
-               ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp,
-                                         &evt->param.sidr_rep_rcvd);
-               uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
-               uvt->info_len = evt->param.sidr_rep_rcvd.info_len;
-               info          = evt->param.sidr_rep_rcvd.info;
-               break;
-       default:
-               uvt->resp.u.send_status = evt->param.send_status;
-               break;
-       }
-
-       if (uvt->data_len) {
-               uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL);
-               if (!uvt->data)
-                       goto err1;
-
-               uvt->resp.present |= IB_UCM_PRES_DATA;
-       }
-
-       if (uvt->info_len) {
-               uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL);
-               if (!uvt->info)
-                       goto err2;
-
-               uvt->resp.present |= IB_UCM_PRES_INFO;
-       }
-       return 0;
-
-err2:
-       kfree(uvt->data);
-err1:
-       return -ENOMEM;
-}
-
-static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
-                               const struct ib_cm_event *event)
-{
-       struct ib_ucm_event *uevent;
-       struct ib_ucm_context *ctx;
-       int result = 0;
-
-       ctx = cm_id->context;
-
-       uevent = kzalloc(sizeof *uevent, GFP_KERNEL);
-       if (!uevent)
-               goto err1;
-
-       uevent->ctx = ctx;
-       uevent->cm_id = cm_id;
-       uevent->resp.uid = ctx->uid;
-       uevent->resp.id = ctx->id;
-       uevent->resp.event = event->event;
-
-       result = ib_ucm_event_process(event, uevent);
-       if (result)
-               goto err2;
-
-       mutex_lock(&ctx->file->file_mutex);
-       list_add_tail(&uevent->file_list, &ctx->file->events);
-       list_add_tail(&uevent->ctx_list, &ctx->events);
-       wake_up_interruptible(&ctx->file->poll_wait);
-       mutex_unlock(&ctx->file->file_mutex);
-       return 0;
-
-err2:
-       kfree(uevent);
-err1:
-       /* Destroy new cm_id's */
-       return ib_ucm_new_cm_id(event->event);
-}
-
-static ssize_t ib_ucm_event(struct ib_ucm_file *file,
-                           const char __user *inbuf,
-                           int in_len, int out_len)
-{
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_event_get cmd;
-       struct ib_ucm_event *uevent;
-       int result = 0;
-
-       if (out_len < sizeof(struct ib_ucm_event_resp))
-               return -ENOSPC;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       mutex_lock(&file->file_mutex);
-       while (list_empty(&file->events)) {
-               mutex_unlock(&file->file_mutex);
-
-               if (file->filp->f_flags & O_NONBLOCK)
-                       return -EAGAIN;
-
-               if (wait_event_interruptible(file->poll_wait,
-                                            !list_empty(&file->events)))
-                       return -ERESTARTSYS;
-
-               mutex_lock(&file->file_mutex);
-       }
-
-       uevent = list_entry(file->events.next, struct ib_ucm_event, file_list);
-
-       if (ib_ucm_new_cm_id(uevent->resp.event)) {
-               ctx = ib_ucm_ctx_alloc(file);
-               if (!ctx) {
-                       result = -ENOMEM;
-                       goto done;
-               }
-
-               ctx->cm_id = uevent->cm_id;
-               ctx->cm_id->context = ctx;
-               uevent->resp.id = ctx->id;
-       }
-
-       if (copy_to_user(u64_to_user_ptr(cmd.response),
-                        &uevent->resp, sizeof(uevent->resp))) {
-               result = -EFAULT;
-               goto done;
-       }
-
-       if (uevent->data) {
-               if (cmd.data_len < uevent->data_len) {
-                       result = -ENOMEM;
-                       goto done;
-               }
-               if (copy_to_user(u64_to_user_ptr(cmd.data),
-                                uevent->data, uevent->data_len)) {
-                       result = -EFAULT;
-                       goto done;
-               }
-       }
-
-       if (uevent->info) {
-               if (cmd.info_len < uevent->info_len) {
-                       result = -ENOMEM;
-                       goto done;
-               }
-               if (copy_to_user(u64_to_user_ptr(cmd.info),
-                                uevent->info, uevent->info_len)) {
-                       result = -EFAULT;
-                       goto done;
-               }
-       }
-
-       list_del(&uevent->file_list);
-       list_del(&uevent->ctx_list);
-       uevent->ctx->events_reported++;
-
-       kfree(uevent->data);
-       kfree(uevent->info);
-       kfree(uevent);
-done:
-       mutex_unlock(&file->file_mutex);
-       return result;
-}
-
-static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
-                               const char __user *inbuf,
-                               int in_len, int out_len)
-{
-       struct ib_ucm_create_id cmd;
-       struct ib_ucm_create_id_resp resp;
-       struct ib_ucm_context *ctx;
-       int result;
-
-       if (out_len < sizeof(resp))
-               return -ENOSPC;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       mutex_lock(&file->file_mutex);
-       ctx = ib_ucm_ctx_alloc(file);
-       mutex_unlock(&file->file_mutex);
-       if (!ctx)
-               return -ENOMEM;
-
-       ctx->uid = cmd.uid;
-       ctx->cm_id = ib_create_cm_id(file->device->ib_dev,
-                                    ib_ucm_event_handler, ctx);
-       if (IS_ERR(ctx->cm_id)) {
-               result = PTR_ERR(ctx->cm_id);
-               goto err1;
-       }
-
-       resp.id = ctx->id;
-       if (copy_to_user(u64_to_user_ptr(cmd.response),
-                        &resp, sizeof(resp))) {
-               result = -EFAULT;
-               goto err2;
-       }
-       return 0;
-
-err2:
-       ib_destroy_cm_id(ctx->cm_id);
-err1:
-       xa_erase(&ctx_id_table, ctx->id);
-       kfree(ctx);
-       return result;
-}
-
-static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
-                                const char __user *inbuf,
-                                int in_len, int out_len)
-{
-       struct ib_ucm_destroy_id cmd;
-       struct ib_ucm_destroy_id_resp resp;
-       struct ib_ucm_context *ctx;
-       int result = 0;
-
-       if (out_len < sizeof(resp))
-               return -ENOSPC;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       xa_lock(&ctx_id_table);
-       ctx = xa_load(&ctx_id_table, cmd.id);
-       if (!ctx)
-               ctx = ERR_PTR(-ENOENT);
-       else if (ctx->file != file)
-               ctx = ERR_PTR(-EINVAL);
-       else
-               __xa_erase(&ctx_id_table, ctx->id);
-       xa_unlock(&ctx_id_table);
-
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
-
-       ib_ucm_ctx_put(ctx);
-       wait_for_completion(&ctx->comp);
-
-       /* No new events will be generated after destroying the cm_id. */
-       ib_destroy_cm_id(ctx->cm_id);
-       /* Cleanup events not yet reported to the user. */
-       ib_ucm_cleanup_events(ctx);
-
-       resp.events_reported = ctx->events_reported;
-       if (copy_to_user(u64_to_user_ptr(cmd.response),
-                        &resp, sizeof(resp)))
-               result = -EFAULT;
-
-       kfree(ctx);
-       return result;
-}
-
-static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file,
-                             const char __user *inbuf,
-                             int in_len, int out_len)
-{
-       struct ib_ucm_attr_id_resp resp;
-       struct ib_ucm_attr_id cmd;
-       struct ib_ucm_context *ctx;
-       int result = 0;
-
-       if (out_len < sizeof(resp))
-               return -ENOSPC;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
-
-       resp.service_id   = ctx->cm_id->service_id;
-       resp.service_mask = ctx->cm_id->service_mask;
-       resp.local_id     = ctx->cm_id->local_id;
-       resp.remote_id    = ctx->cm_id->remote_id;
-
-       if (copy_to_user(u64_to_user_ptr(cmd.response),
-                        &resp, sizeof(resp)))
-               result = -EFAULT;
-
-       ib_ucm_ctx_put(ctx);
-       return result;
-}
-
-static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file,
-                                  const char __user *inbuf,
-                                  int in_len, int out_len)
-{
-       struct ib_uverbs_qp_attr resp;
-       struct ib_ucm_init_qp_attr cmd;
-       struct ib_ucm_context *ctx;
-       struct ib_qp_attr qp_attr;
-       int result = 0;
-
-       if (out_len < sizeof(resp))
-               return -ENOSPC;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
-
-       resp.qp_attr_mask = 0;
-       memset(&qp_attr, 0, sizeof qp_attr);
-       qp_attr.qp_state = cmd.qp_state;
-       result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
-       if (result)
-               goto out;
-
-       ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr);
-
-       if (copy_to_user(u64_to_user_ptr(cmd.response),
-                        &resp, sizeof(resp)))
-               result = -EFAULT;
-
-out:
-       ib_ucm_ctx_put(ctx);
-       return result;
-}
-
-static int ucm_validate_listen(__be64 service_id, __be64 service_mask)
-{
-       service_id &= service_mask;
-
-       if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) ||
-           ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID))
-               return -EINVAL;
-
-       return 0;
-}
-
-static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
-                            const char __user *inbuf,
-                            int in_len, int out_len)
-{
-       struct ib_ucm_listen cmd;
-       struct ib_ucm_context *ctx;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
-
-       result = ucm_validate_listen(cmd.service_id, cmd.service_mask);
-       if (result)
-               goto out;
-
-       result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
-out:
-       ib_ucm_ctx_put(ctx);
-       return result;
-}
-
-static ssize_t ib_ucm_notify(struct ib_ucm_file *file,
-                            const char __user *inbuf,
-                            int in_len, int out_len)
-{
-       struct ib_ucm_notify cmd;
-       struct ib_ucm_context *ctx;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
-
-       result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
-       ib_ucm_ctx_put(ctx);
-       return result;
-}
-
-static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
-{
-       void *data;
-
-       *dest = NULL;
-
-       if (!len)
-               return 0;
-
-       data = memdup_user(u64_to_user_ptr(src), len);
-       if (IS_ERR(data))
-               return PTR_ERR(data);
-
-       *dest = data;
-       return 0;
-}
-
-static int ib_ucm_path_get(struct sa_path_rec **path, u64 src)
-{
-       struct ib_user_path_rec upath;
-       struct sa_path_rec  *sa_path;
-
-       *path = NULL;
-
-       if (!src)
-               return 0;
-
-       sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL);
-       if (!sa_path)
-               return -ENOMEM;
-
-       if (copy_from_user(&upath, u64_to_user_ptr(src),
-                          sizeof(upath))) {
-
-               kfree(sa_path);
-               return -EFAULT;
-       }
-
-       ib_copy_path_rec_from_user(sa_path, &upath);
-       *path = sa_path;
-       return 0;
-}
-
-static ssize_t ib_ucm_send_req(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       struct ib_cm_req_param param;
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_req cmd;
-       int result;
-
-       param.private_data   = NULL;
-       param.primary_path   = NULL;
-       param.alternate_path = NULL;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-       if (result)
-               goto done;
-
-       result = ib_ucm_path_get(&param.primary_path, cmd.primary_path);
-       if (result)
-               goto done;
-
-       result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path);
-       if (result)
-               goto done;
-
-       param.private_data_len           = cmd.len;
-       param.service_id                 = cmd.sid;
-       param.qp_num                     = cmd.qpn;
-       param.qp_type                    = cmd.qp_type;
-       param.starting_psn               = cmd.psn;
-       param.peer_to_peer               = cmd.peer_to_peer;
-       param.responder_resources        = cmd.responder_resources;
-       param.initiator_depth            = cmd.initiator_depth;
-       param.remote_cm_response_timeout = cmd.remote_cm_response_timeout;
-       param.flow_control               = cmd.flow_control;
-       param.local_cm_response_timeout  = cmd.local_cm_response_timeout;
-       param.retry_count                = cmd.retry_count;
-       param.rnr_retry_count            = cmd.rnr_retry_count;
-       param.max_cm_retries             = cmd.max_cm_retries;
-       param.srq                        = cmd.srq;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = ib_send_cm_req(ctx->cm_id, &param);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-done:
-       kfree(param.private_data);
-       kfree(param.primary_path);
-       kfree(param.alternate_path);
-       return result;
-}
-
-static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       struct ib_cm_rep_param param;
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_rep cmd;
-       int result;
-
-       param.private_data = NULL;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-       if (result)
-               return result;
-
-       param.qp_num              = cmd.qpn;
-       param.starting_psn        = cmd.psn;
-       param.private_data_len    = cmd.len;
-       param.responder_resources = cmd.responder_resources;
-       param.initiator_depth     = cmd.initiator_depth;
-       param.failover_accepted   = cmd.failover_accepted;
-       param.flow_control        = cmd.flow_control;
-       param.rnr_retry_count     = cmd.rnr_retry_count;
-       param.srq                 = cmd.srq;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               ctx->uid = cmd.uid;
-               result = ib_send_cm_rep(ctx->cm_id, &param);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-       kfree(param.private_data);
-       return result;
-}
-
-static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file,
-                                       const char __user *inbuf, int in_len,
-                                       int (*func)(struct ib_cm_id *cm_id,
-                                                   const void *private_data,
-                                                   u8 private_data_len))
-{
-       struct ib_ucm_private_data cmd;
-       struct ib_ucm_context *ctx;
-       const void *private_data = NULL;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len);
-       if (result)
-               return result;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = func(ctx->cm_id, private_data, cmd.len);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-       kfree(private_data);
-       return result;
-}
-
-static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu);
-}
-
-static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file,
-                               const char __user *inbuf,
-                               int in_len, int out_len)
-{
-       return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq);
-}
-
-static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file,
-                               const char __user *inbuf,
-                               int in_len, int out_len)
-{
-       return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep);
-}
-
-static ssize_t ib_ucm_send_info(struct ib_ucm_file *file,
-                               const char __user *inbuf, int in_len,
-                               int (*func)(struct ib_cm_id *cm_id,
-                                           int status,
-                                           const void *info,
-                                           u8 info_len,
-                                           const void *data,
-                                           u8 data_len))
-{
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_info cmd;
-       const void *data = NULL;
-       const void *info = NULL;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len);
-       if (result)
-               goto done;
-
-       result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len);
-       if (result)
-               goto done;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = func(ctx->cm_id, cmd.status, info, cmd.info_len,
-                             data, cmd.data_len);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-done:
-       kfree(data);
-       kfree(info);
-       return result;
-}
-
-static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej);
-}
-
-static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr);
-}
-
-static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_mra cmd;
-       const void *data = NULL;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
-       if (result)
-               return result;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-       kfree(data);
-       return result;
-}
-
-static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       struct ib_ucm_context *ctx;
-       struct sa_path_rec *path = NULL;
-       struct ib_ucm_lap cmd;
-       const void *data = NULL;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
-       if (result)
-               goto done;
-
-       result = ib_ucm_path_get(&path, cmd.path);
-       if (result)
-               goto done;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-done:
-       kfree(data);
-       kfree(path);
-       return result;
-}
-
-static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file,
-                                   const char __user *inbuf,
-                                   int in_len, int out_len)
-{
-       struct ib_cm_sidr_req_param param = {};
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_sidr_req cmd;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-       if (result)
-               goto done;
-
-       result = ib_ucm_path_get(&param.path, cmd.path);
-       if (result)
-               goto done;
-
-       param.private_data_len = cmd.len;
-       param.service_id       = cmd.sid;
-       param.timeout_ms       = cmd.timeout;
-       param.max_cm_retries   = cmd.max_cm_retries;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = ib_send_cm_sidr_req(ctx->cm_id, &param);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-done:
-       kfree(param.private_data);
-       kfree(param.path);
-       return result;
-}
-
-static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file,
-                                   const char __user *inbuf,
-                                   int in_len, int out_len)
-{
-       struct ib_cm_sidr_rep_param param;
-       struct ib_ucm_sidr_rep cmd;
-       struct ib_ucm_context *ctx;
-       int result;
-
-       param.info = NULL;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&param.private_data,
-                                  cmd.data, cmd.data_len);
-       if (result)
-               goto done;
-
-       result = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len);
-       if (result)
-               goto done;
-
-       param.qp_num            = cmd.qpn;
-       param.qkey              = cmd.qkey;
-       param.status            = cmd.status;
-       param.info_length       = cmd.info_len;
-       param.private_data_len  = cmd.data_len;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = ib_send_cm_sidr_rep(ctx->cm_id, &param);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-done:
-       kfree(param.private_data);
-       kfree(param.info);
-       return result;
-}
-
-static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file,
-                                 const char __user *inbuf,
-                                 int in_len, int out_len) = {
-       [IB_USER_CM_CMD_CREATE_ID]     = ib_ucm_create_id,
-       [IB_USER_CM_CMD_DESTROY_ID]    = ib_ucm_destroy_id,
-       [IB_USER_CM_CMD_ATTR_ID]       = ib_ucm_attr_id,
-       [IB_USER_CM_CMD_LISTEN]        = ib_ucm_listen,
-       [IB_USER_CM_CMD_NOTIFY]        = ib_ucm_notify,
-       [IB_USER_CM_CMD_SEND_REQ]      = ib_ucm_send_req,
-       [IB_USER_CM_CMD_SEND_REP]      = ib_ucm_send_rep,
-       [IB_USER_CM_CMD_SEND_RTU]      = ib_ucm_send_rtu,
-       [IB_USER_CM_CMD_SEND_DREQ]     = ib_ucm_send_dreq,
-       [IB_USER_CM_CMD_SEND_DREP]     = ib_ucm_send_drep,
-       [IB_USER_CM_CMD_SEND_REJ]      = ib_ucm_send_rej,
-       [IB_USER_CM_CMD_SEND_MRA]      = ib_ucm_send_mra,
-       [IB_USER_CM_CMD_SEND_LAP]      = ib_ucm_send_lap,
-       [IB_USER_CM_CMD_SEND_APR]      = ib_ucm_send_apr,
-       [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req,
-       [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep,
-       [IB_USER_CM_CMD_EVENT]         = ib_ucm_event,
-       [IB_USER_CM_CMD_INIT_QP_ATTR]  = ib_ucm_init_qp_attr,
-};
-
-static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
-                           size_t len, loff_t *pos)
-{
-       struct ib_ucm_file *file = filp->private_data;
-       struct ib_ucm_cmd_hdr hdr;
-       ssize_t result;
-
-       if (!ib_safe_file_access(filp)) {
-               pr_err_once("ucm_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
-                           task_tgid_vnr(current), current->comm);
-               return -EACCES;
-       }
-
-       if (len < sizeof(hdr))
-               return -EINVAL;
-
-       if (copy_from_user(&hdr, buf, sizeof(hdr)))
-               return -EFAULT;
-
-       if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
-               return -EINVAL;
-       hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucm_cmd_table));
-
-       if (hdr.in + sizeof(hdr) > len)
-               return -EINVAL;
-
-       result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr),
-                                       hdr.in, hdr.out);
-       if (!result)
-               result = len;
-
-       return result;
-}
-
-static __poll_t ib_ucm_poll(struct file *filp,
-                               struct poll_table_struct *wait)
-{
-       struct ib_ucm_file *file = filp->private_data;
-       __poll_t mask = 0;
-
-       poll_wait(filp, &file->poll_wait, wait);
-
-       if (!list_empty(&file->events))
-               mask = EPOLLIN | EPOLLRDNORM;
-
-       return mask;
-}
-
-/*
- * ib_ucm_open() does not need the BKL:
- *
- *  - no global state is referred to;
- *  - there is no ioctl method to race against;
- *  - no further module initialization is required for open to work
- *    after the device is registered.
- */
-static int ib_ucm_open(struct inode *inode, struct file *filp)
-{
-       struct ib_ucm_file *file;
-
-       file = kmalloc(sizeof(*file), GFP_KERNEL);
-       if (!file)
-               return -ENOMEM;
-
-       INIT_LIST_HEAD(&file->events);
-       INIT_LIST_HEAD(&file->ctxs);
-       init_waitqueue_head(&file->poll_wait);
-
-       mutex_init(&file->file_mutex);
-
-       filp->private_data = file;
-       file->filp = filp;
-       file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev);
-
-       return stream_open(inode, filp);
-}
-
-static int ib_ucm_close(struct inode *inode, struct file *filp)
-{
-       struct ib_ucm_file *file = filp->private_data;
-       struct ib_ucm_context *ctx;
-
-       mutex_lock(&file->file_mutex);
-       while (!list_empty(&file->ctxs)) {
-               ctx = list_entry(file->ctxs.next,
-                                struct ib_ucm_context, file_list);
-               mutex_unlock(&file->file_mutex);
-
-               xa_erase(&ctx_id_table, ctx->id);
-               ib_destroy_cm_id(ctx->cm_id);
-               ib_ucm_cleanup_events(ctx);
-               kfree(ctx);
-
-               mutex_lock(&file->file_mutex);
-       }
-       mutex_unlock(&file->file_mutex);
-       kfree(file);
-       return 0;
-}
-
-static void ib_ucm_release_dev(struct device *dev)
-{
-       struct ib_ucm_device *ucm_dev;
-
-       ucm_dev = container_of(dev, struct ib_ucm_device, dev);
-       kfree(ucm_dev);
-}
-
-static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev)
-{
-       clear_bit(ucm_dev->devnum, dev_map);
-}
-
-static const struct file_operations ucm_fops = {
-       .owner   = THIS_MODULE,
-       .open    = ib_ucm_open,
-       .release = ib_ucm_close,
-       .write   = ib_ucm_write,
-       .poll    = ib_ucm_poll,
-       .llseek  = no_llseek,
-};
-
-static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
-                         char *buf)
-{
-       struct ib_ucm_device *ucm_dev;
-
-       ucm_dev = container_of(dev, struct ib_ucm_device, dev);
-       return sprintf(buf, "%s\n", ucm_dev->ib_dev->name);
-}
-static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
-
-static void ib_ucm_add_one(struct ib_device *device)
-{
-       int devnum;
-       dev_t base;
-       struct ib_ucm_device *ucm_dev;
-
-       if (!device->ops.alloc_ucontext || !rdma_cap_ib_cm(device, 1))
-               return;
-
-       ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
-       if (!ucm_dev)
-               return;
-
-       device_initialize(&ucm_dev->dev);
-       ucm_dev->ib_dev = device;
-       ucm_dev->dev.release = ib_ucm_release_dev;
-
-       devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
-       if (devnum >= IB_UCM_MAX_DEVICES)
-               goto err;
-       ucm_dev->devnum = devnum;
-       set_bit(devnum, dev_map);
-       if (devnum >= IB_UCM_NUM_FIXED_MINOR)
-               base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR;
-       else
-               base = IB_UCM_BASE_DEV + devnum;
-
-       cdev_init(&ucm_dev->cdev, &ucm_fops);
-       ucm_dev->cdev.owner = THIS_MODULE;
-       kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
-
-       ucm_dev->dev.class = &cm_class;
-       ucm_dev->dev.parent = device->dev.parent;
-       ucm_dev->dev.devt = base;
-
-       dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum);
-       if (cdev_device_add(&ucm_dev->cdev, &ucm_dev->dev))
-               goto err_devnum;
-
-       if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev))
-               goto err_dev;
-
-       ib_set_client_data(device, &ucm_client, ucm_dev);
-       return;
-
-err_dev:
-       cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev);
-err_devnum:
-       ib_ucm_free_dev(ucm_dev);
-err:
-       put_device(&ucm_dev->dev);
-       return;
-}
-
-static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
-{
-       struct ib_ucm_device *ucm_dev = client_data;
-
-       if (!ucm_dev)
-               return;
-
-       cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev);
-       ib_ucm_free_dev(ucm_dev);
-       put_device(&ucm_dev->dev);
-}
-
-static CLASS_ATTR_STRING(abi_version, S_IRUGO,
-                        __stringify(IB_USER_CM_ABI_VERSION));
-
-static int __init ib_ucm_init(void)
-{
-       int ret;
-
-       ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR,
-                                    "infiniband_cm");
-       if (ret) {
-               pr_err("ucm: couldn't register device number\n");
-               goto error1;
-       }
-
-       ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR,
-                                 "infiniband_cm");
-       if (ret) {
-               pr_err("ucm: couldn't register dynamic device number\n");
-               goto err_alloc;
-       }
-
-       ret = class_create_file(&cm_class, &class_attr_abi_version.attr);
-       if (ret) {
-               pr_err("ucm: couldn't create abi_version attribute\n");
-               goto error2;
-       }
-
-       ret = ib_register_client(&ucm_client);
-       if (ret) {
-               pr_err("ucm: couldn't register client\n");
-               goto error3;
-       }
-       return 0;
-
-error3:
-       class_remove_file(&cm_class, &class_attr_abi_version.attr);
-error2:
-       unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
-err_alloc:
-       unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
-error1:
-       return ret;
-}
-
-static void __exit ib_ucm_cleanup(void)
-{
-       ib_unregister_client(&ucm_client);
-       class_remove_file(&cm_class, &class_attr_abi_version.attr);
-       unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
-       unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
-       WARN_ON(!xa_empty(&ctx_id_table));
-}
-
-module_init(ib_ucm_init);
-module_exit(ib_ucm_cleanup);
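
The deleted ib_ucm_write() above is the write()-based command dispatcher the old ucm chardev relied on: validate the header, bounds-check the opcode, clamp it with array_index_nospec() so a mispredicted bounds check cannot be used to read past the table, then jump through a function-pointer table. A minimal sketch of that dispatch pattern, using hypothetical example_* names (not the removed driver's symbols) and assuming the handlers are declared elsewhere:

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/nospec.h>
#include <linux/uaccess.h>

struct example_cmd_hdr {
        __u32 cmd;
        __u16 in;
        __u16 out;
};

static ssize_t (*example_cmd_table[])(struct file *filp,
                                      const char __user *inbuf,
                                      int in_len, int out_len) = {
        [0] = example_cmd_create,       /* hypothetical handlers */
        [1] = example_cmd_destroy,
};

static ssize_t example_write(struct file *filp, const char __user *buf,
                             size_t len, loff_t *pos)
{
        struct example_cmd_hdr hdr;

        if (len < sizeof(hdr))
                return -EINVAL;
        if (copy_from_user(&hdr, buf, sizeof(hdr)))
                return -EFAULT;

        /* bounds check first, then sanitize the index against speculation */
        if (hdr.cmd >= ARRAY_SIZE(example_cmd_table))
                return -EINVAL;
        hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(example_cmd_table));

        return example_cmd_table[hdr.cmd](filp, buf + sizeof(hdr),
                                          hdr.in, hdr.out);
}
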
index 140a338a135f5e46a281ace1919f9827d6116edd..0274e9b704be5930cedb52b2f9d036edec69b6bb 100644 (file)
@@ -52,6 +52,8 @@
 #include <rdma/rdma_cm_ib.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib.h>
+#include <rdma/rdma_netlink.h>
+#include "core_priv.h"
 
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
@@ -81,7 +83,7 @@ struct ucma_file {
 };
 
 struct ucma_context {
-       int                     id;
+       u32                     id;
        struct completion       comp;
        atomic_t                ref;
        int                     events_reported;
@@ -94,7 +96,7 @@ struct ucma_context {
        struct list_head        list;
        struct list_head        mc_list;
        /* mark that device is in process of destroying the internal HW
-        * resources, protected by the global mut
+        * resources, protected by the ctx_table lock
         */
        int                     closing;
        /* sync between removal event and id destroy, protected by file mut */
@@ -104,7 +106,7 @@ struct ucma_context {
 
 struct ucma_multicast {
        struct ucma_context     *ctx;
-       int                     id;
+       u32                     id;
        int                     events_reported;
 
        u64                     uid;
@@ -122,9 +124,8 @@ struct ucma_event {
        struct work_struct      close_work;
 };
 
-static DEFINE_MUTEX(mut);
-static DEFINE_IDR(ctx_idr);
-static DEFINE_IDR(multicast_idr);
+static DEFINE_XARRAY_ALLOC(ctx_table);
+static DEFINE_XARRAY_ALLOC(multicast_table);
 
 static const struct file_operations ucma_fops;
 
@@ -133,7 +134,7 @@ static inline struct ucma_context *_ucma_find_context(int id,
 {
        struct ucma_context *ctx;
 
-       ctx = idr_find(&ctx_idr, id);
+       ctx = xa_load(&ctx_table, id);
        if (!ctx)
                ctx = ERR_PTR(-ENOENT);
        else if (ctx->file != file || !ctx->cm_id)
@@ -145,7 +146,7 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
 {
        struct ucma_context *ctx;
 
-       mutex_lock(&mut);
+       xa_lock(&ctx_table);
        ctx = _ucma_find_context(id, file);
        if (!IS_ERR(ctx)) {
                if (ctx->closing)
@@ -153,7 +154,7 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
                else
                        atomic_inc(&ctx->ref);
        }
-       mutex_unlock(&mut);
+       xa_unlock(&ctx_table);
        return ctx;
 }
 
@@ -216,10 +217,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
        INIT_LIST_HEAD(&ctx->mc_list);
        ctx->file = file;
 
-       mutex_lock(&mut);
-       ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL);
-       mutex_unlock(&mut);
-       if (ctx->id < 0)
+       if (xa_alloc(&ctx_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL))
                goto error;
 
        list_add_tail(&ctx->list, &file->ctx_list);
@@ -238,13 +236,10 @@ static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx)
        if (!mc)
                return NULL;
 
-       mutex_lock(&mut);
-       mc->id = idr_alloc(&multicast_idr, NULL, 0, 0, GFP_KERNEL);
-       mutex_unlock(&mut);
-       if (mc->id < 0)
+       mc->ctx = ctx;
+       if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL))
                goto error;
 
-       mc->ctx = ctx;
        list_add_tail(&mc->list, &ctx->mc_list);
        return mc;
 
@@ -319,9 +314,9 @@ static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
         * handled separately below.
         */
        if (ctx->cm_id == cm_id) {
-               mutex_lock(&mut);
+               xa_lock(&ctx_table);
                ctx->closing = 1;
-               mutex_unlock(&mut);
+               xa_unlock(&ctx_table);
                queue_work(ctx->file->close_wq, &ctx->close_work);
                return;
        }
@@ -523,9 +518,7 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
 err2:
        rdma_destroy_id(cm_id);
 err1:
-       mutex_lock(&mut);
-       idr_remove(&ctx_idr, ctx->id);
-       mutex_unlock(&mut);
+       xa_erase(&ctx_table, ctx->id);
        mutex_lock(&file->mut);
        list_del(&ctx->list);
        mutex_unlock(&file->mut);
@@ -537,13 +530,13 @@ static void ucma_cleanup_multicast(struct ucma_context *ctx)
 {
        struct ucma_multicast *mc, *tmp;
 
-       mutex_lock(&mut);
+       mutex_lock(&ctx->file->mut);
        list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
                list_del(&mc->list);
-               idr_remove(&multicast_idr, mc->id);
+               xa_erase(&multicast_table, mc->id);
                kfree(mc);
        }
-       mutex_unlock(&mut);
+       mutex_unlock(&ctx->file->mut);
 }
 
 static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
@@ -614,11 +607,11 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
        if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
                return -EFAULT;
 
-       mutex_lock(&mut);
+       xa_lock(&ctx_table);
        ctx = _ucma_find_context(cmd.id, file);
        if (!IS_ERR(ctx))
-               idr_remove(&ctx_idr, ctx->id);
-       mutex_unlock(&mut);
+               __xa_erase(&ctx_table, ctx->id);
+       xa_unlock(&ctx_table);
 
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);
@@ -630,14 +623,14 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
        flush_workqueue(ctx->file->close_wq);
        /* At this point it's guaranteed that there is no inflight
         * closing task */
-       mutex_lock(&mut);
+       xa_lock(&ctx_table);
        if (!ctx->closing) {
-               mutex_unlock(&mut);
+               xa_unlock(&ctx_table);
                ucma_put_ctx(ctx);
                wait_for_completion(&ctx->comp);
                rdma_destroy_id(ctx->cm_id);
        } else {
-               mutex_unlock(&mut);
+               xa_unlock(&ctx_table);
        }
 
        resp.events_reported = ucma_free_ctx(ctx);
@@ -951,8 +944,7 @@ static ssize_t ucma_query_path(struct ucma_context *ctx,
                }
        }
 
-       if (copy_to_user(response, resp,
-                        sizeof(*resp) + (i * sizeof(struct ib_path_rec_data))))
+       if (copy_to_user(response, resp, struct_size(resp, path_data, i)))
                ret = -EFAULT;
 
        kfree(resp);
@@ -1432,9 +1424,7 @@ static ssize_t ucma_process_join(struct ucma_file *file,
                goto err3;
        }
 
-       mutex_lock(&mut);
-       idr_replace(&multicast_idr, mc, mc->id);
-       mutex_unlock(&mut);
+       xa_store(&multicast_table, mc->id, mc, 0);
 
        mutex_unlock(&file->mut);
        ucma_put_ctx(ctx);
@@ -1444,9 +1434,7 @@ err3:
        rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
        ucma_cleanup_mc_events(mc);
 err2:
-       mutex_lock(&mut);
-       idr_remove(&multicast_idr, mc->id);
-       mutex_unlock(&mut);
+       xa_erase(&multicast_table, mc->id);
        list_del(&mc->list);
        kfree(mc);
 err1:
@@ -1508,8 +1496,8 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
        if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
                return -EFAULT;
 
-       mutex_lock(&mut);
-       mc = idr_find(&multicast_idr, cmd.id);
+       xa_lock(&multicast_table);
+       mc = xa_load(&multicast_table, cmd.id);
        if (!mc)
                mc = ERR_PTR(-ENOENT);
        else if (mc->ctx->file != file)
@@ -1517,8 +1505,8 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
        else if (!atomic_inc_not_zero(&mc->ctx->ref))
                mc = ERR_PTR(-ENXIO);
        else
-               idr_remove(&multicast_idr, mc->id);
-       mutex_unlock(&mut);
+               __xa_erase(&multicast_table, mc->id);
+       xa_unlock(&multicast_table);
 
        if (IS_ERR(mc)) {
                ret = PTR_ERR(mc);
@@ -1615,14 +1603,14 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
         * events being added before existing events.
         */
        ucma_lock_files(cur_file, new_file);
-       mutex_lock(&mut);
+       xa_lock(&ctx_table);
 
        list_move_tail(&ctx->list, &new_file->ctx_list);
        ucma_move_events(ctx, new_file);
        ctx->file = new_file;
        resp.events_reported = ctx->events_reported;
 
-       mutex_unlock(&mut);
+       xa_unlock(&ctx_table);
        ucma_unlock_files(cur_file, new_file);
 
 response:
@@ -1757,18 +1745,15 @@ static int ucma_close(struct inode *inode, struct file *filp)
                ctx->destroying = 1;
                mutex_unlock(&file->mut);
 
-               mutex_lock(&mut);
-               idr_remove(&ctx_idr, ctx->id);
-               mutex_unlock(&mut);
-
+               xa_erase(&ctx_table, ctx->id);
                flush_workqueue(file->close_wq);
                /* At that step once ctx was marked as destroying and workqueue
                 * was flushed we are safe from any inflights handlers that
                 * might put other closing task.
                 */
-               mutex_lock(&mut);
+               xa_lock(&ctx_table);
                if (!ctx->closing) {
-                       mutex_unlock(&mut);
+                       xa_unlock(&ctx_table);
                        ucma_put_ctx(ctx);
                        wait_for_completion(&ctx->comp);
                        /* rdma_destroy_id ensures that no event handlers are
@@ -1776,7 +1761,7 @@ static int ucma_close(struct inode *inode, struct file *filp)
                         */
                        rdma_destroy_id(ctx->cm_id);
                } else {
-                       mutex_unlock(&mut);
+                       xa_unlock(&ctx_table);
                }
 
                ucma_free_ctx(ctx);
@@ -1805,6 +1790,19 @@ static struct miscdevice ucma_misc = {
        .fops           = &ucma_fops,
 };
 
+static int ucma_get_global_nl_info(struct ib_client_nl_info *res)
+{
+       res->abi = RDMA_USER_CM_ABI_VERSION;
+       res->cdev = ucma_misc.this_device;
+       return 0;
+}
+
+static struct ib_client rdma_cma_client = {
+       .name = "rdma_cm",
+       .get_global_nl_info = ucma_get_global_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("rdma_cm");
+
 static ssize_t show_abi_version(struct device *dev,
                                struct device_attribute *attr,
                                char *buf)
@@ -1833,7 +1831,14 @@ static int __init ucma_init(void)
                ret = -ENOMEM;
                goto err2;
        }
+
+       ret = ib_register_client(&rdma_cma_client);
+       if (ret)
+               goto err3;
+
        return 0;
+err3:
+       unregister_net_sysctl_table(ucma_ctl_table_hdr);
 err2:
        device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
 err1:
@@ -1843,11 +1848,10 @@ err1:
 
 static void __exit ucma_cleanup(void)
 {
+       ib_unregister_client(&rdma_cma_client);
        unregister_net_sysctl_table(ucma_ctl_table_hdr);
        device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
        misc_deregister(&ucma_misc);
-       idr_destroy(&ctx_idr);
-       idr_destroy(&multicast_idr);
 }
 
 module_init(ucma_init);
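
The ucma.c hunks above replace the global "mut" mutex and the two IDRs with self-locking XArrays: xa_alloc() with xa_limit_32b hands out the u32 ids, xa_load()/xa_erase() look entries up and remove them, and xa_lock()/__xa_erase() cover the paths that previously nested work under the global mutex. A condensed sketch of the same pattern, with a hypothetical example_ctx rather than ucma's own types:

#include <linux/refcount.h>
#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(example_table);

struct example_ctx {
        u32             id;
        refcount_t      ref;
};

static int example_ctx_register(struct example_ctx *ctx)
{
        /* pick a free 32-bit id and publish ctx under it */
        return xa_alloc(&example_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL);
}

static struct example_ctx *example_ctx_get(u32 id)
{
        struct example_ctx *ctx;

        xa_lock(&example_table);
        ctx = xa_load(&example_table, id);
        if (ctx)
                refcount_inc(&ctx->ref);        /* take a reference under the XArray lock */
        xa_unlock(&example_table);
        return ctx;
}

static void example_ctx_unregister(struct example_ctx *ctx)
{
        xa_erase(&example_table, ctx->id);      /* locks the XArray internally */
}
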
index e7ea819fcb116dd8bb45a34160af950f4b87316d..08da840ed7eebc151b67a61b15626d90578dea54 100644 (file)
@@ -54,9 +54,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 
        for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
                page = sg_page_iter_page(&sg_iter);
-               if (!PageDirty(page) && umem->writable && dirty)
-                       set_page_dirty_lock(page);
-               put_page(page);
+               if (umem->writable && dirty)
+                       put_user_pages_dirty_lock(&page, 1);
+               else
+                       put_user_page(page);
        }
 
        sg_free_table(&umem->sg_head);
@@ -244,7 +245,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
        umem->context    = context;
        umem->length     = size;
        umem->address    = addr;
-       umem->page_shift = PAGE_SHIFT;
        umem->writable   = ib_access_writable(access);
        umem->owning_mm = mm = current->mm;
        mmgrab(mm);
@@ -361,6 +361,9 @@ static void __ib_umem_release_tail(struct ib_umem *umem)
  */
 void ib_umem_release(struct ib_umem *umem)
 {
+       if (!umem)
+               return;
+
        if (umem->is_odp) {
                ib_umem_odp_release(to_ib_umem_odp(umem));
                __ib_umem_release_tail(umem);
@@ -385,7 +388,7 @@ int ib_umem_page_count(struct ib_umem *umem)
 
        n = 0;
        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
-               n += sg_dma_len(sg) >> umem->page_shift;
+               n += sg_dma_len(sg) >> PAGE_SHIFT;
 
        return n;
 }
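
The umem.c hunks above do two things: __ib_umem_release() now returns pages through the put_user_page*() helpers that pair with get_user_pages() (instead of set_page_dirty_lock() plus put_page()), and ib_umem_release() tolerates a NULL umem so drivers can drop their own NULL checks, as the bnxt_re hunks further down do. A hedged sketch of the release idiom, with a hypothetical helper name:

/* pages pinned through get_user_pages() are handed back via the
 * put_user_page*() helpers so pinned-page accounting stays consistent */
static void example_release_user_pages(struct page **pages,
                                       unsigned long npages, bool dirty)
{
        if (dirty)
                put_user_pages_dirty_lock(pages, npages);
        else
                put_user_pages(pages, npages);
}
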
index f962b5bbfa40e4268de37f99c78b6653895c9599..2a75c6f8d8270e5ca07610684fbe9e5156f72708 100644 (file)
@@ -59,7 +59,7 @@ static u64 node_start(struct umem_odp_node *n)
        struct ib_umem_odp *umem_odp =
                        container_of(n, struct ib_umem_odp, interval_tree);
 
-       return ib_umem_start(&umem_odp->umem);
+       return ib_umem_start(umem_odp);
 }
 
 /* Note that the representation of the intervals in the interval tree
@@ -72,7 +72,7 @@ static u64 node_last(struct umem_odp_node *n)
        struct ib_umem_odp *umem_odp =
                        container_of(n, struct ib_umem_odp, interval_tree);
 
-       return ib_umem_end(&umem_odp->umem) - 1;
+       return ib_umem_end(umem_odp) - 1;
 }
 
 INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
@@ -107,8 +107,6 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
 static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
                                               u64 start, u64 end, void *cookie)
 {
-       struct ib_umem *umem = &umem_odp->umem;
-
        /*
         * Increase the number of notifiers running, to
         * prevent any further fault handling on this MR.
@@ -119,8 +117,8 @@ static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
         * all pending page faults. */
        smp_wmb();
        complete_all(&umem_odp->notifier_completion);
-       umem->context->invalidate_range(umem_odp, ib_umem_start(umem),
-                                       ib_umem_end(umem));
+       umem_odp->umem.context->invalidate_range(
+               umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp));
        return 0;
 }
 
@@ -151,6 +149,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
 {
        struct ib_ucontext_per_mm *per_mm =
                container_of(mn, struct ib_ucontext_per_mm, mn);
+       int rc;
 
        if (mmu_notifier_range_blockable(range))
                down_read(&per_mm->umem_rwsem);
@@ -167,11 +166,14 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
                return 0;
        }
 
-       return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
-                                            range->end,
-                                            invalidate_range_start_trampoline,
-                                            mmu_notifier_range_blockable(range),
-                                            NULL);
+       rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
+                                          range->end,
+                                          invalidate_range_start_trampoline,
+                                          mmu_notifier_range_blockable(range),
+                                          NULL);
+       if (rc)
+               up_read(&per_mm->umem_rwsem);
+       return rc;
 }
 
 static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
@@ -205,10 +207,9 @@ static const struct mmu_notifier_ops ib_umem_notifiers = {
 static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
 {
        struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-       struct ib_umem *umem = &umem_odp->umem;
 
        down_write(&per_mm->umem_rwsem);
-       if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+       if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
                rbt_ib_umem_insert(&umem_odp->interval_tree,
                                   &per_mm->umem_tree);
        up_write(&per_mm->umem_rwsem);
@@ -217,10 +218,9 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
 static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
 {
        struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-       struct ib_umem *umem = &umem_odp->umem;
 
        down_write(&per_mm->umem_rwsem);
-       if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+       if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
                rbt_ib_umem_remove(&umem_odp->interval_tree,
                                   &per_mm->umem_tree);
        complete_all(&umem_odp->notifier_completion);
@@ -351,7 +351,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root,
        umem->context    = ctx;
        umem->length     = size;
        umem->address    = addr;
-       umem->page_shift = PAGE_SHIFT;
+       odp_data->page_shift = PAGE_SHIFT;
        umem->writable   = root->umem.writable;
        umem->is_odp = 1;
        odp_data->per_mm = per_mm;
@@ -405,18 +405,19 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
        struct mm_struct *mm = umem->owning_mm;
        int ret_val;
 
+       umem_odp->page_shift = PAGE_SHIFT;
        if (access & IB_ACCESS_HUGETLB) {
                struct vm_area_struct *vma;
                struct hstate *h;
 
                down_read(&mm->mmap_sem);
-               vma = find_vma(mm, ib_umem_start(umem));
+               vma = find_vma(mm, ib_umem_start(umem_odp));
                if (!vma || !is_vm_hugetlb_page(vma)) {
                        up_read(&mm->mmap_sem);
                        return -EINVAL;
                }
                h = hstate_vma(vma);
-               umem->page_shift = huge_page_shift(h);
+               umem_odp->page_shift = huge_page_shift(h);
                up_read(&mm->mmap_sem);
        }
 
@@ -424,16 +425,16 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
 
        init_completion(&umem_odp->notifier_completion);
 
-       if (ib_umem_num_pages(umem)) {
+       if (ib_umem_odp_num_pages(umem_odp)) {
                umem_odp->page_list =
                        vzalloc(array_size(sizeof(*umem_odp->page_list),
-                                          ib_umem_num_pages(umem)));
+                                          ib_umem_odp_num_pages(umem_odp)));
                if (!umem_odp->page_list)
                        return -ENOMEM;
 
                umem_odp->dma_list =
                        vzalloc(array_size(sizeof(*umem_odp->dma_list),
-                                          ib_umem_num_pages(umem)));
+                                          ib_umem_odp_num_pages(umem_odp)));
                if (!umem_odp->dma_list) {
                        ret_val = -ENOMEM;
                        goto out_page_list;
@@ -456,16 +457,14 @@ out_page_list:
 
 void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 {
-       struct ib_umem *umem = &umem_odp->umem;
-
        /*
         * Ensure that no more pages are mapped in the umem.
         *
         * It is the driver's responsibility to ensure, before calling us,
         * that the hardware will not attempt to access the MR any more.
         */
-       ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem),
-                                   ib_umem_end(umem));
+       ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+                                   ib_umem_end(umem_odp));
 
        remove_umem_from_per_mm(umem_odp);
        put_per_mm(umem_odp);
@@ -487,7 +486,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
  * The function returns -EFAULT if the DMA mapping operation fails. It returns
  * -EAGAIN if a concurrent invalidation prevents us from updating the page.
  *
- * The page is released via put_page even if the operation failed. For
+ * The page is released via put_user_page even if the operation failed. For
  * on-demand pinning, the page is released whenever it isn't stored in the
  * umem.
  */
@@ -498,8 +497,8 @@ static int ib_umem_odp_map_dma_single_page(
                u64 access_mask,
                unsigned long current_seq)
 {
-       struct ib_umem *umem = &umem_odp->umem;
-       struct ib_device *dev = umem->context->device;
+       struct ib_ucontext *context = umem_odp->umem.context;
+       struct ib_device *dev = context->device;
        dma_addr_t dma_addr;
        int remove_existing_mapping = 0;
        int ret = 0;
@@ -514,10 +513,9 @@ static int ib_umem_odp_map_dma_single_page(
                goto out;
        }
        if (!(umem_odp->dma_list[page_index])) {
-               dma_addr = ib_dma_map_page(dev,
-                                          page,
-                                          0, BIT(umem->page_shift),
-                                          DMA_BIDIRECTIONAL);
+               dma_addr =
+                       ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift),
+                                       DMA_BIDIRECTIONAL);
                if (ib_dma_mapping_error(dev, dma_addr)) {
                        ret = -EFAULT;
                        goto out;
@@ -536,15 +534,16 @@ static int ib_umem_odp_map_dma_single_page(
        }
 
 out:
-       put_page(page);
+       put_user_page(page);
 
        if (remove_existing_mapping) {
                ib_umem_notifier_start_account(umem_odp);
-               umem->context->invalidate_range(
+               context->invalidate_range(
                        umem_odp,
-                       ib_umem_start(umem) + (page_index << umem->page_shift),
-                       ib_umem_start(umem) +
-                               ((page_index + 1) << umem->page_shift));
+                       ib_umem_start(umem_odp) +
+                               (page_index << umem_odp->page_shift),
+                       ib_umem_start(umem_odp) +
+                               ((page_index + 1) << umem_odp->page_shift));
                ib_umem_notifier_end_account(umem_odp);
                ret = -EAGAIN;
        }
@@ -581,27 +580,26 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
                              u64 bcnt, u64 access_mask,
                              unsigned long current_seq)
 {
-       struct ib_umem *umem = &umem_odp->umem;
        struct task_struct *owning_process  = NULL;
        struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
        struct page       **local_page_list = NULL;
        u64 page_mask, off;
-       int j, k, ret = 0, start_idx, npages = 0, page_shift;
-       unsigned int flags = 0;
+       int j, k, ret = 0, start_idx, npages = 0;
+       unsigned int flags = 0, page_shift;
        phys_addr_t p = 0;
 
        if (access_mask == 0)
                return -EINVAL;
 
-       if (user_virt < ib_umem_start(umem) ||
-           user_virt + bcnt > ib_umem_end(umem))
+       if (user_virt < ib_umem_start(umem_odp) ||
+           user_virt + bcnt > ib_umem_end(umem_odp))
                return -EFAULT;
 
        local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
        if (!local_page_list)
                return -ENOMEM;
 
-       page_shift = umem->page_shift;
+       page_shift = umem_odp->page_shift;
        page_mask = ~(BIT(page_shift) - 1);
        off = user_virt & (~page_mask);
        user_virt = user_virt & page_mask;
@@ -621,7 +619,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
        if (access_mask & ODP_WRITE_ALLOWED_BIT)
                flags |= FOLL_WRITE;
 
-       start_idx = (user_virt - ib_umem_start(umem)) >> page_shift;
+       start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift;
        k = start_idx;
 
        while (bcnt > 0) {
@@ -659,7 +657,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
                                        ret = -EFAULT;
                                        break;
                                }
-                               put_page(local_page_list[j]);
+                               put_user_page(local_page_list[j]);
                                continue;
                        }
 
@@ -686,8 +684,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
                         * ib_umem_odp_map_dma_single_page().
                         */
                        if (npages - (j + 1) > 0)
-                               release_pages(&local_page_list[j+1],
-                                             npages - (j + 1));
+                               put_user_pages(&local_page_list[j+1],
+                                              npages - (j + 1));
                        break;
                }
        }
@@ -711,21 +709,20 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                                 u64 bound)
 {
-       struct ib_umem *umem = &umem_odp->umem;
        int idx;
        u64 addr;
-       struct ib_device *dev = umem->context->device;
+       struct ib_device *dev = umem_odp->umem.context->device;
 
-       virt  = max_t(u64, virt,  ib_umem_start(umem));
-       bound = min_t(u64, bound, ib_umem_end(umem));
+       virt = max_t(u64, virt, ib_umem_start(umem_odp));
+       bound = min_t(u64, bound, ib_umem_end(umem_odp));
        /* Note that during the run of this function, the
         * notifiers_count of the MR is > 0, preventing any racing
         * faults from completion. We might be racing with other
         * invalidations, so we must make sure we free each page only
         * once. */
        mutex_lock(&umem_odp->umem_mutex);
-       for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) {
-               idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
+       for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
+               idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
                if (umem_odp->page_list[idx]) {
                        struct page *page = umem_odp->page_list[idx];
                        dma_addr_t dma = umem_odp->dma_list[idx];
@@ -733,7 +730,8 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 
                        WARN_ON(!dma_addr);
 
-                       ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
+                       ib_dma_unmap_page(dev, dma_addr,
+                                         BIT(umem_odp->page_shift),
                                          DMA_BIDIRECTIONAL);
                        if (dma & ODP_WRITE_ALLOWED_BIT) {
                                struct page *head_page = compound_head(page);
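
With page_shift moved from struct ib_umem onto struct ib_umem_odp, the ODP paths above derive page indices and DMA mapping lengths from the per-ODP shift (a huge-page backed range keeps huge_page_shift(), everything else uses PAGE_SHIFT). A small sketch of the index arithmetic the map/unmap loops rely on, written as a hypothetical helper:

/* translate a user VA inside the ODP range into an index into
 * umem_odp->page_list / dma_list; assumes addr >= ib_umem_start(umem_odp) */
static inline unsigned long example_odp_page_index(struct ib_umem_odp *umem_odp,
                                                   u64 addr)
{
        return (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
}

/* the matching mapping length for one entry at that granularity */
#define EXAMPLE_ODP_PAGE_SIZE(umem_odp) BIT((umem_odp)->page_shift)
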
index 671f07ba1fad66e8300d93c9b85a65091bcbc9ae..9f8a48016b4152a248f781d69760ce4fa433897c 100644 (file)
@@ -54,6 +54,7 @@
 
 #include <rdma/ib_mad.h>
 #include <rdma/ib_user_mad.h>
+#include <rdma/rdma_netlink.h>
 
 #include "core_priv.h"
 
@@ -744,7 +745,7 @@ found:
                                "process %s did not enable P_Key index support.\n",
                                current->comm);
                        dev_warn(&file->port->dev,
-                               "   Documentation/infiniband/user_mad.txt has info on the new ABI.\n");
+                               "   Documentation/infiniband/user_mad.rst has info on the new ABI.\n");
                }
        }
 
@@ -1124,11 +1125,48 @@ static const struct file_operations umad_sm_fops = {
        .llseek  = no_llseek,
 };
 
+static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data,
+                              struct ib_client_nl_info *res)
+{
+       struct ib_umad_device *umad_dev = client_data;
+
+       if (!rdma_is_port_valid(ibdev, res->port))
+               return -EINVAL;
+
+       res->abi = IB_USER_MAD_ABI_VERSION;
+       res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].dev;
+
+       return 0;
+}
+
 static struct ib_client umad_client = {
        .name   = "umad",
        .add    = ib_umad_add_one,
-       .remove = ib_umad_remove_one
+       .remove = ib_umad_remove_one,
+       .get_nl_info = ib_umad_get_nl_info,
 };
+MODULE_ALIAS_RDMA_CLIENT("umad");
+
+static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data,
+                              struct ib_client_nl_info *res)
+{
+       struct ib_umad_device *umad_dev =
+               ib_get_client_data(ibdev, &umad_client);
+
+       if (!rdma_is_port_valid(ibdev, res->port))
+               return -EINVAL;
+
+       res->abi = IB_USER_MAD_ABI_VERSION;
+       res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].sm_dev;
+
+       return 0;
+}
+
+static struct ib_client issm_client = {
+       .name = "issm",
+       .get_nl_info = ib_issm_get_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("issm");
 
 static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
@@ -1387,13 +1425,17 @@ static int __init ib_umad_init(void)
        }
 
        ret = ib_register_client(&umad_client);
-       if (ret) {
-               pr_err("couldn't register ib_umad client\n");
+       if (ret)
                goto out_class;
-       }
+
+       ret = ib_register_client(&issm_client);
+       if (ret)
+               goto out_client;
 
        return 0;
 
+out_client:
+       ib_unregister_client(&umad_client);
 out_class:
        class_unregister(&umad_class);
 
@@ -1411,6 +1453,7 @@ out:
 
 static void __exit ib_umad_cleanup(void)
 {
+       ib_unregister_client(&issm_client);
        ib_unregister_client(&umad_client);
        class_unregister(&umad_class);
        unregister_chrdev_region(base_umad_dev,
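
The user_mad.c hunks add a second "issm" client and give both clients a get_nl_info() callback plus a MODULE_ALIAS_RDMA_CLIENT() alias; together these let the new nldev netlink queries report a port's ABI version and char device, and let userspace trigger module autoloading by client name. A minimal sketch of such a client, with hypothetical example_* helpers and structures:

static int example_get_nl_info(struct ib_device *ibdev, void *client_data,
                               struct ib_client_nl_info *res)
{
        struct example_device *edev = client_data;

        if (!rdma_is_port_valid(ibdev, res->port))
                return -EINVAL;

        res->abi = EXAMPLE_ABI_VERSION;         /* reported over netlink */
        res->cdev = &edev->ports[res->port - rdma_start_port(ibdev)].dev;
        return 0;
}

static struct ib_client example_client = {
        .name        = "example",
        .add         = example_add_one,
        .remove      = example_remove_one,
        .get_nl_info = example_get_nl_info,
};
MODULE_ALIAS_RDMA_CLIENT("example");    /* module alias used for autoloading */
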
index 63fe14c7c68fc31586fad4cd3559a52d03d62ea0..7ddd0e5bc6b3419f79217a59030037c7b5f5b14c 100644 (file)
@@ -756,7 +756,9 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
 
        mr->device  = pd->device;
        mr->pd      = pd;
+       mr->type    = IB_MR_TYPE_USER;
        mr->dm      = NULL;
+       mr->sig_attrs = NULL;
        mr->uobject = uobj;
        atomic_inc(&pd->usecnt);
        mr->res.type = RDMA_RESTRACK_MR;
@@ -1021,12 +1023,11 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
        attr.comp_vector = cmd->comp_vector;
        attr.flags = cmd->flags;
 
-       cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);
-       if (IS_ERR(cq)) {
-               ret = PTR_ERR(cq);
+       cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
+       if (!cq) {
+               ret = -ENOMEM;
                goto err_file;
        }
-
        cq->device        = ib_dev;
        cq->uobject       = &obj->uobject;
        cq->comp_handler  = ib_uverbs_comp_handler;
@@ -1034,6 +1035,10 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
        cq->cq_context    = ev_file ? &ev_file->ev_queue : NULL;
        atomic_set(&cq->usecnt, 0);
 
+       ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+       if (ret)
+               goto err_free;
+
        obj->uobject.object = cq;
        memset(&resp, 0, sizeof resp);
        resp.base.cq_handle = obj->uobject.id;
@@ -1054,7 +1059,9 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
 
 err_cb:
        ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs));
-
+       cq = NULL;
+err_free:
+       kfree(cq);
 err_file:
        if (ev_file)
                ib_uverbs_release_ucq(attrs->ufile, ev_file, obj);
@@ -2541,7 +2548,7 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs)
        struct ib_uqp_object         *obj;
        struct ib_qp                 *qp;
        struct ib_uverbs_mcast_entry *mcast;
-       int                           ret = -EINVAL;
+       int                           ret;
        bool                          found = false;
 
        ret = uverbs_request(attrs, &cmd, sizeof(cmd));
@@ -3715,9 +3722,6 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs)
  * trailing driver_data flex array. In this case the size of the base struct
  * cannot be changed.
  */
-#define offsetof_after(_struct, _member)                                       \
-       (offsetof(_struct, _member) + sizeof(((_struct *)NULL)->_member))
-
 #define UAPI_DEF_WRITE_IO(req, resp)                                           \
        .write.has_resp = 1 +                                                  \
                          BUILD_BUG_ON_ZERO(offsetof(req, response) != 0) +    \
@@ -3748,11 +3752,11 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs)
  */
 #define UAPI_DEF_WRITE_IO_EX(req, req_last_member, resp, resp_last_member)     \
        .write.has_resp = 1,                                                   \
-       .write.req_size = offsetof_after(req, req_last_member),                \
-       .write.resp_size = offsetof_after(resp, resp_last_member)
+       .write.req_size = offsetofend(req, req_last_member),                   \
+       .write.resp_size = offsetofend(resp, resp_last_member)
 
 #define UAPI_DEF_WRITE_I_EX(req, req_last_member)                              \
-       .write.req_size = offsetof_after(req, req_last_member)
+       .write.req_size = offsetofend(req, req_last_member)
 
 const struct uapi_definition uverbs_def_write_intf[] = {
        DECLARE_UVERBS_OBJECT(
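
The macro change above drops the file-local offsetof_after() in favour of the kernel-wide offsetofend(), which evaluates to the offset of the first byte past a member; that value is what the UAPI_DEF_WRITE_*_EX() entries store in .write.req_size / .write.resp_size. A tiny illustration with a hypothetical request struct:

struct example_req {
        __aligned_u64   response;       /* offset 0, size 8 */
        __u32           handle;         /* offset 8, size 4 */
        __u32           reserved;       /* offset 12, size 4 */
};

/* offsetofend(struct example_req, handle) == offsetof(...) + sizeof(__u32) == 12 */
#define EXAMPLE_DEF_WRITE_I_EX(req, last_member)        \
        .write.req_size = offsetofend(req, last_member)
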
index 84a5e9a6d483e8933502f76e1004267fef509063..11c13c1381cf5c9d6afdb4596c36f8d4a7203c8b 100644 (file)
@@ -51,6 +51,7 @@
 
 #include <rdma/ib.h>
 #include <rdma/uverbs_std_types.h>
+#include <rdma/rdma_netlink.h>
 
 #include "uverbs.h"
 #include "core_priv.h"
@@ -198,7 +199,7 @@ void ib_uverbs_release_file(struct kref *ref)
        ib_dev = srcu_dereference(file->device->ib_dev,
                                  &file->device->disassociate_srcu);
        if (ib_dev && !ib_dev->ops.disassociate_ucontext)
-               module_put(ib_dev->owner);
+               module_put(ib_dev->ops.owner);
        srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
 
        if (atomic_dec_and_test(&file->device->refcount))
@@ -1065,7 +1066,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
        module_dependent = !(ib_dev->ops.disassociate_ucontext);
 
        if (module_dependent) {
-               if (!try_module_get(ib_dev->owner)) {
+               if (!try_module_get(ib_dev->ops.owner)) {
                        ret = -ENODEV;
                        goto err;
                }
@@ -1100,7 +1101,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
        return stream_open(inode, filp);
 
 err_module:
-       module_put(ib_dev->owner);
+       module_put(ib_dev->ops.owner);
 
 err:
        mutex_unlock(&dev->lists_mutex);
@@ -1148,12 +1149,41 @@ static const struct file_operations uverbs_mmap_fops = {
        .compat_ioctl = ib_uverbs_ioctl,
 };
 
+static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data,
+                                struct ib_client_nl_info *res)
+{
+       struct ib_uverbs_device *uverbs_dev = client_data;
+       int ret;
+
+       if (res->port != -1)
+               return -EINVAL;
+
+       res->abi = ibdev->ops.uverbs_abi_ver;
+       res->cdev = &uverbs_dev->dev;
+
+       /*
+        * To support DRIVER_ID binding in userspace some of the drivers need
+        * upgrading to expose their PCI dependent revision information
+        * through get_context instead of relying on modalias matching. When
+        * the drivers are fixed they can drop this flag.
+        */
+       if (!ibdev->ops.uverbs_no_driver_id_binding) {
+               ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID,
+                                 ibdev->ops.driver_id);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
 static struct ib_client uverbs_client = {
        .name   = "uverbs",
        .no_kverbs_req = true,
        .add    = ib_uverbs_add_one,
-       .remove = ib_uverbs_remove_one
+       .remove = ib_uverbs_remove_one,
+       .get_nl_info = ib_uverbs_get_nl_info,
 };
+MODULE_ALIAS_RDMA_CLIENT("uverbs");
 
 static ssize_t ibdev_show(struct device *device, struct device_attribute *attr,
                          char *buf)
@@ -1186,7 +1216,7 @@ static ssize_t abi_version_show(struct device *device,
        srcu_key = srcu_read_lock(&dev->disassociate_srcu);
        ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
        if (ib_dev)
-               ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+               ret = sprintf(buf, "%u\n", ib_dev->ops.uverbs_abi_ver);
        srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
        return ret;
index 07ea4e3c45663a5fef7d889e986ddd3918a0efe4..e39fe6a8aac43382eb016e82f91bd6f35e4d7986 100644 (file)
@@ -111,9 +111,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
        INIT_LIST_HEAD(&obj->comp_list);
        INIT_LIST_HEAD(&obj->async_list);
 
-       cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);
-       if (IS_ERR(cq)) {
-               ret = PTR_ERR(cq);
+       cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
+       if (!cq) {
+               ret = -ENOMEM;
                goto err_event_file;
        }
 
@@ -122,10 +122,15 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
        cq->comp_handler  = ib_uverbs_comp_handler;
        cq->event_handler = ib_uverbs_cq_event_handler;
        cq->cq_context    = ev_file ? &ev_file->ev_queue : NULL;
-       obj->uobject.object = cq;
-       obj->uobject.user_handle = user_handle;
        atomic_set(&cq->usecnt, 0);
        cq->res.type = RDMA_RESTRACK_CQ;
+
+       ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+       if (ret)
+               goto err_free;
+
+       obj->uobject.object = cq;
+       obj->uobject.user_handle = user_handle;
        rdma_restrack_uadd(&cq->res);
 
        ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe,
@@ -136,7 +141,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
        return 0;
 err_cq:
        ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs));
-
+       cq = NULL;
+err_free:
+       kfree(cq);
 err_event_file:
        if (ev_file)
                uverbs_uobject_put(ev_file_uobj);
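
Both CQ creation paths above follow the new allocation scheme: the core allocates the CQ with rdma_zalloc_drv_obj(), fills the common ib_cq fields, and only then calls the driver's ->create_cq(), which receives the pre-allocated struct ib_cq and returns an int; on teardown the driver callback is void and the core performs the final kfree() (see the __ib_create_cq()/ib_destroy_cq_user() and bnxt_re hunks further down). A hedged driver-side sketch under that contract, using a hypothetical example_cq and hardware helpers:

struct example_cq {
        struct ib_cq    ibcq;           /* kept first; the core casts the allocation to ib_cq */
        void            *hw_ring;
};

static int example_create_cq(struct ib_cq *ibcq,
                             const struct ib_cq_init_attr *attr,
                             struct ib_udata *udata)
{
        struct example_cq *cq = container_of(ibcq, struct example_cq, ibcq);

        /* the core already zeroed *cq and set the common ib_cq fields;
         * only the hardware-specific setup is left to the driver */
        return example_hw_cq_alloc(cq, attr->cqe);
}

static void example_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
{
        /* release hardware state only; the core kfree()s ibcq afterwards */
        example_hw_cq_free(container_of(ibcq, struct example_cq, ibcq));
}

/* declared in the driver's ib_device_ops alongside the callbacks:
 *      INIT_RDMA_OBJ_SIZE(ib_cq, example_cq, ibcq),
 */
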
index 997f7a3a558af96c839ca3b74aacc9767bd5fcf8..c1286a52dc8451d4a81eb918c1f0f0482630e732 100644 (file)
@@ -128,6 +128,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
 
        mr->device  = pd->device;
        mr->pd      = pd;
+       mr->type    = IB_MR_TYPE_DM;
        mr->dm      = dm;
        mr->uobject = uobj;
        atomic_inc(&pd->usecnt);
index 7a987acf0c0bbdf0b48460371ca164b6cb34a194..00c5478871322a7a85274140b593eb23ebf451e2 100644 (file)
@@ -22,6 +22,8 @@ static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size)
                return ERR_PTR(-EOVERFLOW);
 
        elm = kzalloc(alloc_size, GFP_KERNEL);
+       if (!elm)
+               return ERR_PTR(-ENOMEM);
        rc = radix_tree_insert(&uapi->radix, key, elm);
        if (rc) {
                kfree(elm);
@@ -645,7 +647,7 @@ struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev)
                return ERR_PTR(-ENOMEM);
 
        INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL);
-       uapi->driver_id = ibdev->driver_id;
+       uapi->driver_id = ibdev->ops.driver_id;
 
        rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false);
        if (rc)
index e666a1f7608d868621cdd279f455adef15b3d30b..92349bf37589f79d4fa6a589882c6de5c5c21aa0 100644 (file)
@@ -209,7 +209,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
 EXPORT_SYMBOL(ib_rate_to_mbps);
 
 __attribute_const__ enum rdma_transport_type
-rdma_node_get_transport(enum rdma_node_type node_type)
+rdma_node_get_transport(unsigned int node_type)
 {
 
        if (node_type == RDMA_NODE_USNIC)
@@ -299,6 +299,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 
                mr->device      = pd->device;
                mr->pd          = pd;
+               mr->type        = IB_MR_TYPE_DMA;
                mr->uobject     = NULL;
                mr->need_inval  = false;
 
@@ -316,7 +317,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 EXPORT_SYMBOL(__ib_alloc_pd);
 
 /**
- * ib_dealloc_pd - Deallocates a protection domain.
+ * ib_dealloc_pd_user - Deallocates a protection domain.
  * @pd: The protection domain to deallocate.
  * @udata: Valid user data or NULL for kernel object
  *
@@ -1157,6 +1158,10 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
            qp_init_attr->cap.max_recv_sge))
                return ERR_PTR(-EINVAL);
 
+       if ((qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) &&
+           !(device->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER))
+               return ERR_PTR(-EINVAL);
+
        /*
         * If the callers is using the RDMA API calculate the resources
         * needed for the RDMA READ/WRITE operations.
@@ -1232,6 +1237,8 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
        qp->max_write_sge = qp_init_attr->cap.max_send_sge;
        qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
                                 device->attrs.max_sge_rd);
+       if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN)
+               qp->integrity_en = true;
 
        return qp;
 
@@ -1683,6 +1690,14 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
                }
        }
 
+       /*
+        * Bind this qp to a counter automatically based on the rdma counter
+        * rules. This is only set in RST2INIT when a port is specified.
+        */
+       if (!qp->counter && (attr_mask & IB_QP_PORT) &&
+           ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT))
+               rdma_counter_bind_qp_auto(qp, attr->port_num);
+
        ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
        if (ret)
                goto out;
@@ -1878,6 +1893,7 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
        if (!qp->uobject)
                rdma_rw_cleanup_mrs(qp);
 
+       rdma_counter_unbind_qp(qp, true);
        rdma_restrack_del(&qp->res);
        ret = qp->device->ops.destroy_qp(qp, udata);
        if (!ret) {
@@ -1916,21 +1932,28 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,
                             const char *caller)
 {
        struct ib_cq *cq;
+       int ret;
+
+       cq = rdma_zalloc_drv_obj(device, ib_cq);
+       if (!cq)
+               return ERR_PTR(-ENOMEM);
 
-       cq = device->ops.create_cq(device, cq_attr, NULL);
-
-       if (!IS_ERR(cq)) {
-               cq->device        = device;
-               cq->uobject       = NULL;
-               cq->comp_handler  = comp_handler;
-               cq->event_handler = event_handler;
-               cq->cq_context    = cq_context;
-               atomic_set(&cq->usecnt, 0);
-               cq->res.type = RDMA_RESTRACK_CQ;
-               rdma_restrack_set_task(&cq->res, caller);
-               rdma_restrack_kadd(&cq->res);
+       cq->device = device;
+       cq->uobject = NULL;
+       cq->comp_handler = comp_handler;
+       cq->event_handler = event_handler;
+       cq->cq_context = cq_context;
+       atomic_set(&cq->usecnt, 0);
+       cq->res.type = RDMA_RESTRACK_CQ;
+       rdma_restrack_set_task(&cq->res, caller);
+
+       ret = device->ops.create_cq(cq, cq_attr, NULL);
+       if (ret) {
+               kfree(cq);
+               return ERR_PTR(ret);
        }
 
+       rdma_restrack_kadd(&cq->res);
        return cq;
 }
 EXPORT_SYMBOL(__ib_create_cq);
@@ -1949,7 +1972,9 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)
                return -EBUSY;
 
        rdma_restrack_del(&cq->res);
-       return cq->device->ops.destroy_cq(cq, udata);
+       cq->device->ops.destroy_cq(cq, udata);
+       kfree(cq);
+       return 0;
 }
 EXPORT_SYMBOL(ib_destroy_cq_user);
 
@@ -1966,6 +1991,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
 {
        struct ib_pd *pd = mr->pd;
        struct ib_dm *dm = mr->dm;
+       struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
        int ret;
 
        rdma_restrack_del(&mr->res);
@@ -1974,6 +2000,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
                atomic_dec(&pd->usecnt);
                if (dm)
                        atomic_dec(&dm->usecnt);
+               kfree(sig_attrs);
        }
 
        return ret;
@@ -1981,7 +2008,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
 EXPORT_SYMBOL(ib_dereg_mr_user);
 
 /**
- * ib_alloc_mr() - Allocates a memory region
+ * ib_alloc_mr_user() - Allocates a memory region
  * @pd:            protection domain associated with the region
  * @mr_type:       memory region type
  * @max_num_sg:    maximum sg entries available for registration.
@@ -2001,6 +2028,9 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
        if (!pd->device->ops.alloc_mr)
                return ERR_PTR(-EOPNOTSUPP);
 
+       if (WARN_ON_ONCE(mr_type == IB_MR_TYPE_INTEGRITY))
+               return ERR_PTR(-EINVAL);
+
        mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata);
        if (!IS_ERR(mr)) {
                mr->device  = pd->device;
@@ -2011,12 +2041,66 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
                mr->need_inval = false;
                mr->res.type = RDMA_RESTRACK_MR;
                rdma_restrack_kadd(&mr->res);
+               mr->type = mr_type;
+               mr->sig_attrs = NULL;
        }
 
        return mr;
 }
 EXPORT_SYMBOL(ib_alloc_mr_user);
 
+/**
+ * ib_alloc_mr_integrity() - Allocates an integrity memory region
+ * @pd:                      protection domain associated with the region
+ * @max_num_data_sg:         maximum data sg entries available for registration
+ * @max_num_meta_sg:         maximum metadata sg entries available for
+ *                           registration
+ *
+ * Notes:
+ * Memory registration page/sg lists must not exceed max_num_data_sg,
+ * and the integrity page/sg lists must not exceed max_num_meta_sg.
+ *
+ */
+struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
+                                   u32 max_num_data_sg,
+                                   u32 max_num_meta_sg)
+{
+       struct ib_mr *mr;
+       struct ib_sig_attrs *sig_attrs;
+
+       if (!pd->device->ops.alloc_mr_integrity ||
+           !pd->device->ops.map_mr_sg_pi)
+               return ERR_PTR(-EOPNOTSUPP);
+
+       if (!max_num_meta_sg)
+               return ERR_PTR(-EINVAL);
+
+       sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL);
+       if (!sig_attrs)
+               return ERR_PTR(-ENOMEM);
+
+       mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg,
+                                               max_num_meta_sg);
+       if (IS_ERR(mr)) {
+               kfree(sig_attrs);
+               return mr;
+       }
+
+       mr->device = pd->device;
+       mr->pd = pd;
+       mr->dm = NULL;
+       mr->uobject = NULL;
+       atomic_inc(&pd->usecnt);
+       mr->need_inval = false;
+       mr->res.type = RDMA_RESTRACK_MR;
+       rdma_restrack_kadd(&mr->res);
+       mr->type = IB_MR_TYPE_INTEGRITY;
+       mr->sig_attrs = sig_attrs;
+
+       return mr;
+}
+EXPORT_SYMBOL(ib_alloc_mr_integrity);
+
 /* "Fast" memory regions */
 
 struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
@@ -2226,19 +2310,17 @@ EXPORT_SYMBOL(ib_create_wq);
  */
 int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 {
-       int err;
        struct ib_cq *cq = wq->cq;
        struct ib_pd *pd = wq->pd;
 
        if (atomic_read(&wq->usecnt))
                return -EBUSY;
 
-       err = wq->device->ops.destroy_wq(wq, udata);
-       if (!err) {
-               atomic_dec(&pd->usecnt);
-               atomic_dec(&cq->usecnt);
-       }
-       return err;
+       wq->device->ops.destroy_wq(wq, udata);
+       atomic_dec(&pd->usecnt);
+       atomic_dec(&cq->usecnt);
+
+       return 0;
 }
 EXPORT_SYMBOL(ib_destroy_wq);
 
@@ -2375,6 +2457,43 @@ int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
 }
 EXPORT_SYMBOL(ib_set_vf_guid);
 
+/**
+ * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection
+ *     information) and set an appropriate memory region for registration.
+ * @mr:             memory region
+ * @data_sg:        dma mapped scatterlist for data
+ * @data_sg_nents:  number of entries in data_sg
+ * @data_sg_offset: offset in bytes into data_sg
+ * @meta_sg:        dma mapped scatterlist for metadata
+ * @meta_sg_nents:  number of entries in meta_sg
+ * @meta_sg_offset: offset in bytes into meta_sg
+ * @page_size:      page vector desired page size
+ *
+ * Constraints:
+ * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY.
+ *
+ * Return: 0 on success.
+ *
+ * After this completes successfully, the memory region
+ * is ready for registration.
+ */
+int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg,
+                   int data_sg_nents, unsigned int *data_sg_offset,
+                   struct scatterlist *meta_sg, int meta_sg_nents,
+                   unsigned int *meta_sg_offset, unsigned int page_size)
+{
+       if (unlikely(!mr->device->ops.map_mr_sg_pi ||
+                    WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY)))
+               return -EOPNOTSUPP;
+
+       mr->page_size = page_size;
+
+       return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents,
+                                           data_sg_offset, meta_sg,
+                                           meta_sg_nents, meta_sg_offset);
+}
+EXPORT_SYMBOL(ib_map_mr_sg_pi);
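A hedged usage sketch for ib_map_mr_sg_pi() (illustrative names; data_sgl/meta_sgl are assumed to be DMA-mapped scatterlists and SZ_4K an example page size; passing NULL offsets follows the convention of existing ib_map_mr_sg() callers):

	int ret;

	/* Map both the data and metadata lists onto the integrity MR. */
	ret = ib_map_mr_sg_pi(mr, data_sgl, data_nents, NULL,
			      meta_sgl, meta_nents, NULL, SZ_4K);
	if (ret < 0)
		return ret;	/* e.g. -EOPNOTSUPP if the device lacks the op */

	/* The MR is now ready for registration, per the kernel-doc above. */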
+
 /**
  * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list
 *     and set it as the memory region.
index 77094be1b2627de8d541cb57a06350e67a3281e9..433fca59febdffe3392996b8ed6dbfdfdbeaf28a 100644 (file)
@@ -7,7 +7,6 @@ obj-$(CONFIG_INFINIBAND_EFA)            += efa/
 obj-$(CONFIG_INFINIBAND_I40IW)         += i40iw/
 obj-$(CONFIG_MLX4_INFINIBAND)          += mlx4/
 obj-$(CONFIG_MLX5_INFINIBAND)          += mlx5/
-obj-$(CONFIG_INFINIBAND_NES)           += nes/
 obj-$(CONFIG_INFINIBAND_OCRDMA)                += ocrdma/
 obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA) += vmw_pvrdma/
 obj-$(CONFIG_INFINIBAND_USNIC)         += usnic/
index 2c3685faa57a42defe87428d5cdf3f01f4136b2f..a91653aabf3899d9c7e3f5dc4be6ab8ff168ac6c 100644 (file)
@@ -805,10 +805,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
                rdev->sqp_ah = NULL;
        }
 
-       if (!IS_ERR_OR_NULL(qp->rumem))
-               ib_umem_release(qp->rumem);
-       if (!IS_ERR_OR_NULL(qp->sumem))
-               ib_umem_release(qp->sumem);
+       ib_umem_release(qp->rumem);
+       ib_umem_release(qp->sumem);
 
        mutex_lock(&rdev->qp_lock);
        list_del(&qp->list);
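This and the following bnxt_re hunks all make the same simplification: the explicit NULL/IS_ERR_OR_NULL guards around ib_umem_release() are dropped, which assumes the core helper is now safe to call with a NULL umem. Schematically (illustrative, condensed from the hunks in this file):

	/* Before: every caller guarded the release itself. */
	if (!IS_ERR_OR_NULL(qp->rumem))
		ib_umem_release(qp->rumem);

	/* After: the NULL check lives in (or is unnecessary for) ib_umem_release(). */
	ib_umem_release(qp->rumem);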
@@ -1201,12 +1199,8 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
 qp_destroy:
        bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
 free_umem:
-       if (udata) {
-               if (qp->rumem)
-                       ib_umem_release(qp->rumem);
-               if (qp->sumem)
-                       ib_umem_release(qp->sumem);
-       }
+       ib_umem_release(qp->rumem);
+       ib_umem_release(qp->sumem);
 fail:
        kfree(qp);
        return ERR_PTR(rc);
@@ -1302,8 +1296,7 @@ void bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata)
        if (qplib_srq->cq)
                nq = qplib_srq->cq->nq;
        bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq);
-       if (srq->umem)
-               ib_umem_release(srq->umem);
+       ib_umem_release(srq->umem);
        atomic_dec(&rdev->srq_count);
        if (nq)
                nq->budget--;
@@ -1412,8 +1405,7 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq,
        return 0;
 
 fail:
-       if (srq->umem)
-               ib_umem_release(srq->umem);
+       ib_umem_release(srq->umem);
 exit:
        return rc;
 }
@@ -2517,9 +2509,8 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr,
 }
 
 /* Completion Queues */
-int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
-       int rc;
        struct bnxt_re_cq *cq;
        struct bnxt_qplib_nq *nq;
        struct bnxt_re_dev *rdev;
@@ -2528,29 +2519,20 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
        rdev = cq->rdev;
        nq = cq->qplib_cq.nq;
 
-       rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq);
-       if (rc) {
-               dev_err(rdev_to_dev(rdev), "Failed to destroy HW CQ");
-               return rc;
-       }
-       if (!IS_ERR_OR_NULL(cq->umem))
-               ib_umem_release(cq->umem);
+       bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq);
+       ib_umem_release(cq->umem);
 
        atomic_dec(&rdev->cq_count);
        nq->budget--;
        kfree(cq->cql);
-       kfree(cq);
-
-       return 0;
 }
 
-struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
-                               const struct ib_cq_init_attr *attr,
-                               struct ib_udata *udata)
+int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                     struct ib_udata *udata)
 {
-       struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+       struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibcq->device, ibdev);
        struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
-       struct bnxt_re_cq *cq = NULL;
+       struct bnxt_re_cq *cq = container_of(ibcq, struct bnxt_re_cq, ib_cq);
        int rc, entries;
        int cqe = attr->cqe;
        struct bnxt_qplib_nq *nq = NULL;
@@ -2559,11 +2541,8 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
        /* Validate CQ fields */
        if (cqe < 1 || cqe > dev_attr->max_cq_wqes) {
                dev_err(rdev_to_dev(rdev), "Failed to create CQ -max exceeded");
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        }
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq)
-               return ERR_PTR(-ENOMEM);
 
        cq->rdev = rdev;
        cq->qplib_cq.cq_handle = (u64)(unsigned long)(&cq->qplib_cq);
@@ -2641,15 +2620,13 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
                }
        }
 
-       return &cq->ib_cq;
+       return 0;
 
 c2fail:
-       if (udata)
-               ib_umem_release(cq->umem);
+       ib_umem_release(cq->umem);
 fail:
        kfree(cq->cql);
-       kfree(cq);
-       return ERR_PTR(rc);
+       return rc;
 }
 
 static u8 __req_to_ib_wc_status(u8 qstatus)
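The create/destroy CQ conversion above follows the tree-wide move of CQ allocation into ib_core: the driver no longer kzalloc()s or kfree()s its CQ and instead initializes the core-allocated object reached via container_of(). For that to work the driver is expected to advertise its per-CQ structure size in its ib_device_ops, roughly as sketched below (illustrative; the actual ops table is outside this excerpt):

	static const struct ib_device_ops bnxt_re_dev_ops = {
		/* ... other ops ... */
		.create_cq = bnxt_re_create_cq,
		.destroy_cq = bnxt_re_destroy_cq,

		/* Tell ib_core how much memory to allocate per CQ and where the
		 * embedded struct ib_cq sits inside the driver structure. */
		INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq),
	};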
@@ -3353,8 +3330,7 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
                mr->npages = 0;