Merge tag 'nfs-for-4.1-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 27 Apr 2015 00:33:59 +0000 (17:33 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 27 Apr 2015 00:33:59 +0000 (17:33 -0700)
Pull NFS client updates from Trond Myklebust:
 "Another set of mainly bugfixes and a couple of cleanups.  No new
  functionality in this round.

  Highlights include:

  Stable patches:
   - Fix a regression in /proc/self/mountstats
   - Fix the pNFS flexfiles O_DIRECT support
   - Fix high load average due to callback thread sleeping

  Bugfixes:
   - Various patches to fix the pNFS layoutcommit support
   - Do not cache pNFS deviceids unless server notifications are enabled
   - Fix a SUNRPC transport reconnection regression
   - make debugfs file creation failure non-fatal in SUNRPC
   - Another fix for circular directory warnings on NFSv4 "junctioned"
     mountpoints
   - Fix locking around NFSv4.2 fallocate() support
   - Truncating NFSv4 file opens should also sync O_DIRECT writes
   - Prevent infinite loop in rpcrdma_ep_create()

  Features:
   - Various improvements to the RDMA transport code's handling of
     memory registration
   - Various code cleanups"

* tag 'nfs-for-4.1-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (55 commits)
  fs/nfs: fix new compiler warning about boolean in switch
  nfs: Remove unneeded casts in nfs
  NFS: Don't attempt to decode missing directory entries
  Revert "nfs: replace nfs_add_stats with nfs_inc_stats when add one"
  NFS: Rename idmap.c to nfs4idmap.c
  NFS: Move nfs_idmap.h into fs/nfs/
  NFS: Remove CONFIG_NFS_V4 checks from nfs_idmap.h
  NFS: Add a stub for GETDEVICELIST
  nfs: remove WARN_ON_ONCE from nfs_direct_good_bytes
  nfs: fix DIO good bytes calculation
  nfs: Fetch MOUNTED_ON_FILEID when updating an inode
  sunrpc: make debugfs file creation failure non-fatal
  nfs: fix high load average due to callback thread sleeping
  NFS: Reduce time spent holding the i_mutex during fallocate()
  NFS: Don't zap caches on fallocate()
  xprtrdma: Make rpcrdma_{un}map_one() into inline functions
  xprtrdma: Handle non-SEND completions via a callout
  xprtrdma: Add "open" memreg op
  xprtrdma: Add "destroy MRs" memreg op
  xprtrdma: Add "reset MRs" memreg op
  ...

49 files changed:
fs/nfs/Makefile
fs/nfs/blocklayout/blocklayout.c
fs/nfs/blocklayout/dev.c
fs/nfs/callback.c
fs/nfs/client.c
fs/nfs/delegation.c
fs/nfs/dir.c
fs/nfs/direct.c
fs/nfs/file.c
fs/nfs/filelayout/filelayout.c
fs/nfs/filelayout/filelayoutdev.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayoutdev.c
fs/nfs/inode.c
fs/nfs/nfs42proc.c
fs/nfs/nfs42xdr.c
fs/nfs/nfs4client.c
fs/nfs/nfs4file.c
fs/nfs/nfs4idmap.c [moved from fs/nfs/idmap.c with 99% similarity]
fs/nfs/nfs4idmap.h [moved from include/linux/nfs_idmap.h with 94% similarity]
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4super.c
fs/nfs/nfs4sysctl.c
fs/nfs/nfs4xdr.c
fs/nfs/nfstrace.c
fs/nfs/objlayout/objio_osd.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/pnfs_dev.c
fs/nfs/pnfs_nfs.c
fs/nfs/read.c
fs/nfs/super.c
fs/nfs/write.c
include/linux/nfs_fs.h
include/linux/nfs_xdr.h
include/linux/sunrpc/msg_prot.h
include/linux/sunrpc/xprtrdma.h
include/uapi/linux/nfs_idmap.h
net/sunrpc/sched.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/Makefile
net/sunrpc/xprtrdma/fmr_ops.c [new file with mode: 0644]
net/sunrpc/xprtrdma/frwr_ops.c [new file with mode: 0644]
net/sunrpc/xprtrdma/physical_ops.c [new file with mode: 0644]
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h

index 1e987acf20c941312d31cd22bd153222f92e8d71..8664417955a2730b0813a2984b2fe4689a38e3ce 100644 (file)
@@ -22,7 +22,7 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
 obj-$(CONFIG_NFS_V4) += nfsv4.o
 CFLAGS_nfs4trace.o += -I$(src)
 nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
-         delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
+         delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \
          nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \
          dns_resolve.o nfs4trace.o
 nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
index 1cac3c175d1870e63126c37ba4e1c377b5ffe5ca..d2554fe140a3e106fa93bf8d2f048fcad66a2730 100644 (file)
@@ -890,6 +890,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
        .free_deviceid_node             = bl_free_deviceid_node,
        .pg_read_ops                    = &bl_pg_read_ops,
        .pg_write_ops                   = &bl_pg_write_ops,
+       .sync                           = pnfs_generic_sync,
 };
 
 static int __init nfs4blocklayout_init(void)
index 5aed4f98df411be1d7612f566f5929d1db28c97a..e535599a07191619c28eba93342388a59267114e 100644 (file)
@@ -33,7 +33,7 @@ bl_free_deviceid_node(struct nfs4_deviceid_node *d)
                container_of(d, struct pnfs_block_dev, node);
 
        bl_free_device(dev);
-       kfree(dev);
+       kfree_rcu(dev, node.rcu);
 }
 
 static int
index 351be9205bf889bd48bf6b1f39b195899e5ab141..8d129bb7355afbb2ca7f1904ff0263f604e86dd6 100644 (file)
@@ -128,7 +128,7 @@ nfs41_callback_svc(void *vrqstp)
                if (try_to_freeze())
                        continue;
 
-               prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE);
+               prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
                spin_lock_bh(&serv->sv_cb_lock);
                if (!list_empty(&serv->sv_cb_list)) {
                        req = list_first_entry(&serv->sv_cb_list,
@@ -142,10 +142,10 @@ nfs41_callback_svc(void *vrqstp)
                                error);
                } else {
                        spin_unlock_bh(&serv->sv_cb_lock);
-                       /* schedule_timeout to game the hung task watchdog */
-                       schedule_timeout(60 * HZ);
+                       schedule();
                        finish_wait(&serv->sv_cb_waitq, &wq);
                }
+               flush_signals(current);
        }
        return 0;
 }
index 19874151e95c9908660132e6a3e2e6d7d2d3c8e4..892aefff36300a0861f9bf5d43cb7a4b5069d873 100644 (file)
@@ -31,7 +31,6 @@
 #include <linux/lockd/bind.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
-#include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
index a6ad688658803424d726afae85597ad2ace80044..029d688a969f4427e57ccf6f19b0dca9035c3d71 100644 (file)
@@ -378,7 +378,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
                if (freeme == NULL)
                        goto out;
        }
-       list_add_rcu(&delegation->super_list, &server->delegations);
+       list_add_tail_rcu(&delegation->super_list, &server->delegations);
        rcu_assign_pointer(nfsi->delegation, delegation);
        delegation = NULL;
 
@@ -514,7 +514,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
 
        delegation = nfs_inode_detach_delegation(inode);
        if (delegation != NULL)
-               nfs_do_return_delegation(inode, delegation, 0);
+               nfs_do_return_delegation(inode, delegation, 1);
 }
 
 /**
index 1e51ecd618544186d9d013a39ea666ee46ec7a0e..b2c8b31b2be77d9a1d524b230ed2b66e479ad3fe 100644 (file)
@@ -543,6 +543,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
        if (scratch == NULL)
                return -ENOMEM;
 
+       if (buflen == 0)
+               goto out_nopages;
+
        xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 
@@ -564,6 +567,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
                        break;
        } while (!entry->eof);
 
+out_nopages:
        if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
                array = nfs_readdir_get_array(page);
                if (!IS_ERR(array)) {
index b2cbc3a6cdd9dd2702c97872f8caa14038dbe88a..38678d9a5cc4a64838e70ff3c915107b828bb65d 100644 (file)
@@ -129,22 +129,25 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
        int i;
        ssize_t count;
 
-       WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count);
-
-       count = dreq->mirrors[hdr->pgio_mirror_idx].count;
-       if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
-               count = hdr->io_start + hdr->good_bytes - dreq->io_start;
-               dreq->mirrors[hdr->pgio_mirror_idx].count = count;
-       }
-
-       /* update the dreq->count by finding the minimum agreed count from all
-        * mirrors */
-       count = dreq->mirrors[0].count;
+       if (dreq->mirror_count == 1) {
+               dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
+               dreq->count += hdr->good_bytes;
+       } else {
+               /* mirrored writes */
+               count = dreq->mirrors[hdr->pgio_mirror_idx].count;
+               if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
+                       count = hdr->io_start + hdr->good_bytes - dreq->io_start;
+                       dreq->mirrors[hdr->pgio_mirror_idx].count = count;
+               }
+               /* update the dreq->count by finding the minimum agreed count from all
+                * mirrors */
+               count = dreq->mirrors[0].count;
 
-       for (i = 1; i < dreq->mirror_count; i++)
-               count = min(count, dreq->mirrors[i].count);
+               for (i = 1; i < dreq->mirror_count; i++)
+                       count = min(count, dreq->mirrors[i].count);
 
-       dreq->count = count;
+               dreq->count = count;
+       }
 }
 
 /*
@@ -258,18 +261,11 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
        if (!IS_SWAPFILE(inode))
                return 0;
 
-#ifndef CONFIG_NFS_SWAP
-       dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
-                       iocb->ki_filp, (long long) pos, iter->nr_segs);
-
-       return -EINVAL;
-#else
        VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
 
        if (iov_iter_rw(iter) == READ)
                return nfs_file_direct_read(iocb, iter, pos);
        return nfs_file_direct_write(iocb, iter);
-#endif /* CONFIG_NFS_SWAP */
 }
 
 static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -1030,6 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
                        if (i_size_read(inode) < iocb->ki_pos)
                                i_size_write(inode, iocb->ki_pos);
                        spin_unlock(&inode->i_lock);
+                       generic_write_sync(file, pos, result);
                }
        }
        nfs_direct_req_release(dreq);
index c40e4363e746eab4014991f326005aa27721b460..8b8d83a526ce2366ae974a87761c9c62c22da7f5 100644 (file)
@@ -280,6 +280,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
        trace_nfs_fsync_enter(inode);
 
+       nfs_inode_dio_wait(inode);
        do {
                ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
                if (ret != 0)
@@ -782,7 +783,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
         * Flush all pending writes before doing anything
         * with locks..
         */
-       nfs_sync_mapping(filp->f_mapping);
+       vfs_fsync(filp, 0);
 
        l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
        if (!IS_ERR(l_ctx)) {
index 91e88a7ecef0c64354b0278fc01381e0970d2086..a46bf6de9ce455a97ef18275efe4906ac10e12ed 100644 (file)
@@ -258,7 +258,8 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
            hdr->res.verf->committed != NFS_DATA_SYNC)
                return;
 
-       pnfs_set_layoutcommit(hdr);
+       pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+                       hdr->mds_offset + hdr->res.count);
        dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
                (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
@@ -373,7 +374,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
        }
 
        if (data->verf.committed == NFS_UNSTABLE)
-               pnfs_commit_set_layoutcommit(data);
+               pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 
        return 0;
 }
@@ -1086,7 +1087,7 @@ filelayout_alloc_deviceid_node(struct nfs_server *server,
 }
 
 static void
-filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+filelayout_free_deviceid_node(struct nfs4_deviceid_node *d)
 {
        nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
 }
@@ -1137,7 +1138,8 @@ static struct pnfs_layoutdriver_type filelayout_type = {
        .read_pagelist          = filelayout_read_pagelist,
        .write_pagelist         = filelayout_write_pagelist,
        .alloc_deviceid_node    = filelayout_alloc_deviceid_node,
-       .free_deviceid_node     = filelayout_free_deveiceid_node,
+       .free_deviceid_node     = filelayout_free_deviceid_node,
+       .sync                   = pnfs_nfs_generic_sync,
 };
 
 static int __init nfs4filelayout_init(void)
index 4f372e2246034415583485aa680bd43a0a83a04f..4946ef40ba875e6255857ca3c27b6df352c7a46a 100644 (file)
@@ -55,7 +55,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
                        nfs4_pnfs_ds_put(ds);
        }
        kfree(dsaddr->stripe_indices);
-       kfree(dsaddr);
+       kfree_rcu(dsaddr, id_node.rcu);
 }
 
 /* Decode opaque device data and return the result */
index 315cc68945b9d1d3ad00b04e2597e38d7b7c4cab..7d05089e52d6c8b7a29e92e80011bdee0e06a32c 100644 (file)
 #include <linux/module.h>
 
 #include <linux/sunrpc/metrics.h>
-#include <linux/nfs_idmap.h>
 
 #include "flexfilelayout.h"
 #include "../nfs4session.h"
+#include "../nfs4idmap.h"
 #include "../internal.h"
 #include "../delegation.h"
 #include "../nfs4trace.h"
@@ -891,7 +891,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 static void
 ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
-       pnfs_set_layoutcommit(hdr);
+       pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+                       hdr->mds_offset + hdr->res.count);
        dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
                (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
@@ -1074,7 +1075,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
        }
 
        if (data->verf.committed == NFS_UNSTABLE)
-               pnfs_commit_set_layoutcommit(data);
+               pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 
        return 0;
 }
@@ -1414,7 +1415,7 @@ ff_layout_get_ds_info(struct inode *inode)
 }
 
 static void
-ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
 {
        nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
                                                  id_node));
@@ -1498,7 +1499,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .pg_read_ops            = &ff_layout_pg_read_ops,
        .pg_write_ops           = &ff_layout_pg_write_ops,
        .get_ds_info            = ff_layout_get_ds_info,
-       .free_deviceid_node     = ff_layout_free_deveiceid_node,
+       .free_deviceid_node     = ff_layout_free_deviceid_node,
        .mark_request_commit    = pnfs_layout_mark_request_commit,
        .clear_request_commit   = pnfs_generic_clear_request_commit,
        .scan_commit_lists      = pnfs_generic_scan_commit_lists,
@@ -1508,6 +1509,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .write_pagelist         = ff_layout_write_pagelist,
        .alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
        .encode_layoutreturn    = ff_layout_encode_layoutreturn,
+       .sync                   = pnfs_nfs_generic_sync,
 };
 
 static int __init nfs4flexfilelayout_init(void)
index e2c01f204a956b584725d1d187d1ea1fc22d6ab0..77a2d026aa12b62bdc29dac2345cff0b3237e9c4 100644 (file)
@@ -30,7 +30,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
 {
        nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
        nfs4_pnfs_ds_put(mirror_ds->ds);
-       kfree(mirror_ds);
+       kfree_rcu(mirror_ds, id_node.rcu);
 }
 
 /* Decode opaque device data and construct new_ds using it */
index 3689e95da79aea3cf4ad68feb54001f156a931fa..f734562c6d244034cb5036fee8ab0b7d69cc90c5 100644 (file)
@@ -133,6 +133,13 @@ void nfs_evict_inode(struct inode *inode)
        nfs_clear_inode(inode);
 }
 
+int nfs_sync_inode(struct inode *inode)
+{
+       nfs_inode_dio_wait(inode);
+       return nfs_wb_all(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_sync_inode);
+
 /**
  * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
  */
@@ -192,7 +199,6 @@ void nfs_zap_caches(struct inode *inode)
        nfs_zap_caches_locked(inode);
        spin_unlock(&inode->i_lock);
 }
-EXPORT_SYMBOL_GPL(nfs_zap_caches);
 
 void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
 {
@@ -525,10 +531,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
        trace_nfs_setattr_enter(inode);
 
        /* Write all dirty data */
-       if (S_ISREG(inode->i_mode)) {
-               nfs_inode_dio_wait(inode);
-               nfs_wb_all(inode);
-       }
+       if (S_ISREG(inode->i_mode))
+               nfs_sync_inode(inode);
 
        fattr = nfs_alloc_fattr();
        if (fattr == NULL)
@@ -644,8 +648,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        trace_nfs_getattr_enter(inode);
        /* Flush out writes to the server in order to update c/mtime.  */
        if (S_ISREG(inode->i_mode)) {
-               nfs_inode_dio_wait(inode);
-               err = filemap_write_and_wait(inode->i_mapping);
+               mutex_lock(&inode->i_mutex);
+               err = nfs_sync_inode(inode);
+               mutex_unlock(&inode->i_mutex);
                if (err)
                        goto out;
        }
@@ -1588,6 +1593,19 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
 }
 EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc);
 
+
+static inline bool nfs_fileid_valid(struct nfs_inode *nfsi,
+                                   struct nfs_fattr *fattr)
+{
+       bool ret1 = true, ret2 = true;
+
+       if (fattr->valid & NFS_ATTR_FATTR_FILEID)
+               ret1 = (nfsi->fileid == fattr->fileid);
+       if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+               ret2 = (nfsi->fileid == fattr->mounted_on_fileid);
+       return ret1 || ret2;
+}
+
 /*
  * Many nfs protocol calls return the new file attributes after
  * an operation.  Here we update the inode to reflect the state
@@ -1614,7 +1632,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        nfs_display_fhandle_hash(NFS_FH(inode)),
                        atomic_read(&inode->i_count), fattr->valid);
 
-       if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) {
+       if (!nfs_fileid_valid(nfsi, fattr)) {
                printk(KERN_ERR "NFS: server %s error: fileid changed\n"
                        "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
                        NFS_SERVER(inode)->nfs_client->cl_hostname,
@@ -1819,7 +1837,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 struct inode *nfs_alloc_inode(struct super_block *sb)
 {
        struct nfs_inode *nfsi;
-       nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
+       nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
        if (!nfsi)
                return NULL;
        nfsi->flags = 0UL;
index cb170722769cd807990177274bc5a70e64bf806a..3a9e75235f30e60e5a4ae974864c63fb003b969d 100644 (file)
@@ -36,13 +36,16 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
                                 loff_t offset, loff_t len)
 {
        struct inode *inode = file_inode(filep);
+       struct nfs_server *server = NFS_SERVER(inode);
        struct nfs42_falloc_args args = {
                .falloc_fh      = NFS_FH(inode),
                .falloc_offset  = offset,
                .falloc_length  = len,
+               .falloc_bitmask = server->cache_consistency_bitmask,
+       };
+       struct nfs42_falloc_res res = {
+               .falloc_server  = server,
        };
-       struct nfs42_falloc_res res;
-       struct nfs_server *server = NFS_SERVER(inode);
        int status;
 
        msg->rpc_argp = &args;
@@ -52,8 +55,17 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
        if (status)
                return status;
 
-       return nfs4_call_sync(server->client, server, msg,
-                             &args.seq_args, &res.seq_res, 0);
+       res.falloc_fattr = nfs_alloc_fattr();
+       if (!res.falloc_fattr)
+               return -ENOMEM;
+
+       status = nfs4_call_sync(server->client, server, msg,
+                               &args.seq_args, &res.seq_res, 0);
+       if (status == 0)
+               status = nfs_post_op_update_inode(inode, res.falloc_fattr);
+
+       kfree(res.falloc_fattr);
+       return status;
 }
 
 static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
@@ -84,9 +96,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
        if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
                return -EOPNOTSUPP;
 
+       mutex_lock(&inode->i_mutex);
+
        err = nfs42_proc_fallocate(&msg, filep, offset, len);
        if (err == -EOPNOTSUPP)
                NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
+
+       mutex_unlock(&inode->i_mutex);
        return err;
 }
 
@@ -101,9 +117,16 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
        if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
                return -EOPNOTSUPP;
 
+       nfs_wb_all(inode);
+       mutex_lock(&inode->i_mutex);
+
        err = nfs42_proc_fallocate(&msg, filep, offset, len);
+       if (err == 0)
+               truncate_pagecache_range(inode, offset, (offset + len) -1);
        if (err == -EOPNOTSUPP)
                NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
+
+       mutex_unlock(&inode->i_mutex);
        return err;
 }
 
index 038a7e1521fa8f0d157aaebdd6d39a78ff9a7eab..1a25b27248f2ff5fd0026b00a3032589ee8cdff5 100644 (file)
 
 #define NFS4_enc_allocate_sz           (compound_encode_hdr_maxsz + \
                                         encode_putfh_maxsz + \
-                                        encode_allocate_maxsz)
+                                        encode_allocate_maxsz + \
+                                        encode_getattr_maxsz)
 #define NFS4_dec_allocate_sz           (compound_decode_hdr_maxsz + \
                                         decode_putfh_maxsz + \
-                                        decode_allocate_maxsz)
+                                        decode_allocate_maxsz + \
+                                        decode_getattr_maxsz)
 #define NFS4_enc_deallocate_sz         (compound_encode_hdr_maxsz + \
                                         encode_putfh_maxsz + \
-                                        encode_deallocate_maxsz)
+                                        encode_deallocate_maxsz + \
+                                        encode_getattr_maxsz)
 #define NFS4_dec_deallocate_sz         (compound_decode_hdr_maxsz + \
                                         decode_putfh_maxsz + \
-                                        decode_deallocate_maxsz)
+                                        decode_deallocate_maxsz + \
+                                        decode_getattr_maxsz)
 #define NFS4_enc_seek_sz               (compound_encode_hdr_maxsz + \
                                         encode_putfh_maxsz + \
                                         encode_seek_maxsz)
@@ -92,6 +96,7 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
        encode_sequence(xdr, &args->seq_args, &hdr);
        encode_putfh(xdr, args->falloc_fh, &hdr);
        encode_allocate(xdr, args, &hdr);
+       encode_getfattr(xdr, args->falloc_bitmask, &hdr);
        encode_nops(&hdr);
 }
 
@@ -110,6 +115,7 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
        encode_sequence(xdr, &args->seq_args, &hdr);
        encode_putfh(xdr, args->falloc_fh, &hdr);
        encode_deallocate(xdr, args, &hdr);
+       encode_getfattr(xdr, args->falloc_bitmask, &hdr);
        encode_nops(&hdr);
 }
 
@@ -183,6 +189,9 @@ static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp,
        if (status)
                goto out;
        status = decode_allocate(xdr, res);
+       if (status)
+               goto out;
+       decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
 out:
        return status;
 }
@@ -207,6 +216,9 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
        if (status)
                goto out;
        status = decode_deallocate(xdr, res);
+       if (status)
+               goto out;
+       decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
 out:
        return status;
 }
index 51c2dbd1e942dd14c588e3020c6cd5fc6e956f97..e42be52a8c18d8121c934a5ac79483e77166206c 100644 (file)
@@ -4,7 +4,6 @@
  */
 #include <linux/module.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs_idmap.h>
 #include <linux/nfs_mount.h>
 #include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/auth.h>
@@ -15,6 +14,7 @@
 #include "callback.h"
 #include "delegation.h"
 #include "nfs4session.h"
+#include "nfs4idmap.h"
 #include "pnfs.h"
 #include "netns.h"
 
index 619eca34e70fcc258b04b824f62d95b2f142765a..f58c17b3b480367c6322359ae7ca4b33b3695348 100644 (file)
@@ -10,6 +10,8 @@
 #include "fscache.h"
 #include "pnfs.h"
 
+#include "nfstrace.h"
+
 #ifdef CONFIG_NFS_V4_2
 #include "nfs42.h"
 #endif
@@ -57,7 +59,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
        if (openflags & O_TRUNC) {
                attr.ia_valid |= ATTR_SIZE;
                attr.ia_size = 0;
-               nfs_wb_all(inode);
+               nfs_sync_inode(inode);
        }
 
        inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
@@ -100,6 +102,9 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        int ret;
        struct inode *inode = file_inode(file);
 
+       trace_nfs_fsync_enter(inode);
+
+       nfs_inode_dio_wait(inode);
        do {
                ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
                if (ret != 0)
@@ -107,7 +112,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
                mutex_lock(&inode->i_mutex);
                ret = nfs_file_fsync_commit(file, start, end, datasync);
                if (!ret)
-                       ret = pnfs_layoutcommit_inode(inode, true);
+                       ret = pnfs_sync_inode(inode, !!datasync);
                mutex_unlock(&inode->i_mutex);
                /*
                 * If nfs_file_fsync_commit detected a server reboot, then
@@ -118,6 +123,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
                end = LLONG_MAX;
        } while (ret == -EAGAIN);
 
+       trace_nfs_fsync_exit(inode, ret);
        return ret;
 }
 
@@ -152,15 +158,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
        if (ret < 0)
                return ret;
 
-       mutex_lock(&inode->i_mutex);
        if (mode & FALLOC_FL_PUNCH_HOLE)
-               ret = nfs42_proc_deallocate(filep, offset, len);
-       else
-               ret = nfs42_proc_allocate(filep, offset, len);
-       mutex_unlock(&inode->i_mutex);
-
-       nfs_zap_caches(inode);
-       return ret;
+               return nfs42_proc_deallocate(filep, offset, len);
+       return nfs42_proc_allocate(filep, offset, len);
 }
 #endif /* CONFIG_NFS_V4_2 */
 
similarity index 99%
rename from fs/nfs/idmap.c
rename to fs/nfs/nfs4idmap.c
index 857e2a99acc8d30e6bcc85a7b7eb9e0b193ae5bc..2e1737c40a29837488823e04f27db975e0a51b9b 100644 (file)
@@ -36,7 +36,6 @@
 #include <linux/types.h>
 #include <linux/parser.h>
 #include <linux/fs.h>
-#include <linux/nfs_idmap.h>
 #include <net/net_namespace.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/nfs_fs.h>
@@ -49,6 +48,7 @@
 
 #include "internal.h"
 #include "netns.h"
+#include "nfs4idmap.h"
 #include "nfs4trace.h"
 
 #define NFS_UINT_MAXLEN 11
similarity index 94%
rename from include/linux/nfs_idmap.h
rename to fs/nfs/nfs4idmap.h
index 333844e38f66c4918d3693a96cfdb0b50f9b031d..de44d7330ab3ba7bf1f36ced933f33f47b814fc3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * include/linux/nfs_idmap.h
+ * fs/nfs/nfs4idmap.h
  *
  *  UID and GID to name mapping for clients.
  *
@@ -46,19 +46,8 @@ struct nfs_server;
 struct nfs_fattr;
 struct nfs4_string;
 
-#if IS_ENABLED(CONFIG_NFS_V4)
 int nfs_idmap_init(void);
 void nfs_idmap_quit(void);
-#else
-static inline int nfs_idmap_init(void)
-{
-       return 0;
-}
-
-static inline void nfs_idmap_quit(void)
-{}
-#endif
-
 int nfs_idmap_new(struct nfs_client *);
 void nfs_idmap_delete(struct nfs_client *);
 
index 98e533f2c94a60c5bd0dd5ea5af66b1974fac197..45b35b9b1e36a1213a2c2736e4ae551dcd0d8848 100644 (file)
@@ -51,7 +51,6 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/module.h>
-#include <linux/nfs_idmap.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
 #include <linux/freezer.h>
@@ -63,6 +62,7 @@
 #include "callback.h"
 #include "pnfs.h"
 #include "netns.h"
+#include "nfs4idmap.h"
 #include "nfs4session.h"
 #include "fscache.h"
 
@@ -185,7 +185,8 @@ const u32 nfs4_fattr_bitmap[3] = {
        | FATTR4_WORD1_SPACE_USED
        | FATTR4_WORD1_TIME_ACCESS
        | FATTR4_WORD1_TIME_METADATA
-       | FATTR4_WORD1_TIME_MODIFY,
+       | FATTR4_WORD1_TIME_MODIFY
+       | FATTR4_WORD1_MOUNTED_ON_FILEID,
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
        FATTR4_WORD2_SECURITY_LABEL
 #endif
@@ -3095,16 +3096,13 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
                         struct nfs_fsinfo *info,
                         bool auth_probe)
 {
-       int status;
+       int status = 0;
 
-       switch (auth_probe) {
-       case false:
+       if (!auth_probe)
                status = nfs4_lookup_root(server, fhandle, info);
-               if (status != -NFS4ERR_WRONGSEC)
-                       break;
-       default:
+
+       if (auth_probe || status == NFS4ERR_WRONGSEC)
                status = nfs4_do_find_root_sec(server, fhandle, info);
-       }
 
        if (status == 0)
                status = nfs4_server_capabilities(server, fhandle);
@@ -7944,6 +7942,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server,
 {
        struct nfs4_getdeviceinfo_args args = {
                .pdev = pdev,
+               .notify_types = NOTIFY_DEVICEID4_CHANGE |
+                       NOTIFY_DEVICEID4_DELETE,
        };
        struct nfs4_getdeviceinfo_res res = {
                .pdev = pdev,
@@ -7958,6 +7958,11 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server,
 
        dprintk("--> %s\n", __func__);
        status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+       if (res.notification & ~args.notify_types)
+               dprintk("%s: unsupported notification\n", __func__);
+       if (res.notification != args.notify_types)
+               pdev->nocache = 1;
+
        dprintk("<-- %s status=%d\n", __func__, status);
 
        return status;
index 3b2b20534a3a78c965d023d4c6eef1860b513ef6..2782cfca22650922e012a4f86a1755e3cca68243 100644 (file)
@@ -42,7 +42,6 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs_idmap.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
@@ -57,6 +56,7 @@
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
+#include "nfs4idmap.h"
 #include "nfs4session.h"
 #include "pnfs.h"
 #include "netns.h"
index 75090feeafade9c790bb4bf0d84c8546a46819a0..6fb7cb6b3f4b038398e3c8ef1681cc4d274636a5 100644 (file)
@@ -3,12 +3,12 @@
  */
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/nfs_idmap.h>
 #include <linux/nfs4_mount.h>
 #include <linux/nfs_fs.h>
 #include "delegation.h"
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "nfs4idmap.h"
 #include "dns_resolve.h"
 #include "pnfs.h"
 #include "nfs.h"
@@ -91,10 +91,11 @@ static void nfs4_evict_inode(struct inode *inode)
 {
        truncate_inode_pages_final(&inode->i_data);
        clear_inode(inode);
-       pnfs_return_layout(inode);
-       pnfs_destroy_layout(NFS_I(inode));
        /* If we are holding a delegation, return it! */
        nfs_inode_return_delegation_noreclaim(inode);
+       /* Note that above delegreturn would trigger pnfs return-on-close */
+       pnfs_return_layout(inode);
+       pnfs_destroy_layout(NFS_I(inode));
        /* First call standard NFS clear_inode() code */
        nfs_clear_inode(inode);
 }
index b6ebe7e445f63d34c5bb1d8cc6ef587867cc6813..0fbd3ab1be222e9d3c47a7c96c16e11955724ad0 100644 (file)
@@ -6,10 +6,10 @@
  * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com>
  */
 #include <linux/sysctl.h>
-#include <linux/nfs_idmap.h>
 #include <linux/nfs_fs.h>
 
 #include "nfs4_fs.h"
+#include "nfs4idmap.h"
 #include "callback.h"
 
 static const int nfs_set_port_min = 0;
index 5c399ec41079687791d2fb668d5f6543ce374a2f..0aea97841d3038b56056d0d7fcd0dcddb11f584e 100644 (file)
 #include <linux/nfs.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs_idmap.h>
 
 #include "nfs4_fs.h"
 #include "internal.h"
+#include "nfs4idmap.h"
 #include "nfs4session.h"
 #include "pnfs.h"
 #include "netns.h"
@@ -1920,7 +1920,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
 
        p = reserve_space(xdr, 4 + 4);
        *p++ = cpu_to_be32(1);                  /* bitmap length */
-       *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
+       *p++ = cpu_to_be32(args->notify_types);
 }
 
 static void
@@ -5753,8 +5753,9 @@ out_overflow:
 
 #if defined(CONFIG_NFS_V4_1)
 static int decode_getdeviceinfo(struct xdr_stream *xdr,
-                               struct pnfs_device *pdev)
+                               struct nfs4_getdeviceinfo_res *res)
 {
+       struct pnfs_device *pdev = res->pdev;
        __be32 *p;
        uint32_t len, type;
        int status;
@@ -5802,12 +5803,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
                if (unlikely(!p))
                        goto out_overflow;
 
-               if (be32_to_cpup(p++) &
-                   ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
-                       dprintk("%s: unsupported notification\n",
-                               __func__);
-               }
-
+               res->notification = be32_to_cpup(p++);
                for (i = 1; i < len; i++) {
                        if (be32_to_cpup(p++)) {
                                dprintk("%s: unsupported notification\n",
@@ -7061,7 +7057,7 @@ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status != 0)
                goto out;
-       status = decode_getdeviceinfo(xdr, res->pdev);
+       status = decode_getdeviceinfo(xdr, res);
 out:
        return status;
 }
@@ -7365,6 +7361,11 @@ nfs4_stat_to_errno(int stat)
        .p_name   = #proc,                                      \
 }
 
+#define STUB(proc)             /* table slot with only .p_name set */ \
+[NFSPROC4_CLNT_##proc] = {     \
+       .p_name = #proc,        \
+}
+
 struct rpc_procinfo    nfs4_procedures[] = {
        PROC(READ,              enc_read,               dec_read),
        PROC(WRITE,             enc_write,              dec_write),
@@ -7417,6 +7418,7 @@ struct rpc_procinfo       nfs4_procedures[] = {
        PROC(SECINFO_NO_NAME,   enc_secinfo_no_name,    dec_secinfo_no_name),
        PROC(TEST_STATEID,      enc_test_stateid,       dec_test_stateid),
        PROC(FREE_STATEID,      enc_free_stateid,       dec_free_stateid),
+       STUB(GETDEVICELIST),
        PROC(BIND_CONN_TO_SESSION,
                        enc_bind_conn_to_session, dec_bind_conn_to_session),
        PROC(DESTROY_CLIENTID,  enc_destroy_clientid,   dec_destroy_clientid),
index 4eb0aead69b6c954de19f64099fdd9268f36350d..c74f7af23d77a756b1f785fa78723c00c26a4312 100644 (file)
@@ -7,3 +7,6 @@
 
 #define CREATE_TRACE_POINTS
 #include "nfstrace.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
index 24e1d7403c0be241fa90dd6645f86dddadf0add9..5aaed363556a66e7bd5bb43d4619d037108f5421 100644 (file)
@@ -57,7 +57,7 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
 
        dprintk("%s: free od=%p\n", __func__, de->od.od);
        osduld_put_device(de->od.od);
-       kfree(de);
+       kfree_rcu(d, rcu);
 }
 
 struct objio_segment {
@@ -637,6 +637,8 @@ static struct pnfs_layoutdriver_type objlayout_type = {
        .pg_read_ops             = &objio_pg_read_ops,
        .pg_write_ops            = &objio_pg_write_ops,
 
+       .sync                    = pnfs_generic_sync,
+
        .free_deviceid_node      = objio_free_deviceid_node,
 
        .encode_layoutcommit     = objlayout_encode_layoutcommit,
index 4f802b02fbb9b0c6fc61912be044f6cef94acd32..230606243be6ad079733e583d173b14d3baeda55 100644 (file)
@@ -1090,6 +1090,7 @@ bool pnfs_roc(struct inode *ino)
        pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
+       pnfs_layoutcommit_inode(ino, true);
        return true;
 
 out_noroc:
@@ -1104,8 +1105,10 @@ out_noroc:
                }
        }
        spin_unlock(&ino->i_lock);
-       if (layoutreturn)
+       if (layoutreturn) {
+               pnfs_layoutcommit_inode(ino, true);
                pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+       }
        return false;
 }
 
@@ -1841,7 +1844,8 @@ void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
 {
        trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
        if (!hdr->pnfs_error) {
-               pnfs_set_layoutcommit(hdr);
+               pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+                               hdr->mds_offset + hdr->res.count);
                hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
        } else
                pnfs_ld_handle_write_error(hdr);
@@ -1902,7 +1906,6 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
        pnfs_put_lseg(hdr->lseg);
        nfs_pgio_header_free(hdr);
 }
-EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
 
 int
 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
@@ -2032,7 +2035,6 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
        pnfs_put_lseg(hdr->lseg);
        nfs_pgio_header_free(hdr);
 }
-EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
 
 int
 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
@@ -2099,64 +2101,34 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
 
 void
-pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
+pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
+               loff_t end_pos)
 {
-       struct inode *inode = hdr->inode;
        struct nfs_inode *nfsi = NFS_I(inode);
-       loff_t end_pos = hdr->mds_offset + hdr->res.count;
        bool mark_as_dirty = false;
 
        spin_lock(&inode->i_lock);
        if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
-               mark_as_dirty = true;
-               dprintk("%s: Set layoutcommit for inode %lu ",
-                       __func__, inode->i_ino);
-       }
-       if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
-               /* references matched in nfs4_layoutcommit_release */
-               pnfs_get_lseg(hdr->lseg);
-       }
-       if (end_pos > nfsi->layout->plh_lwb)
                nfsi->layout->plh_lwb = end_pos;
-       spin_unlock(&inode->i_lock);
-       dprintk("%s: lseg %p end_pos %llu\n",
-               __func__, hdr->lseg, nfsi->layout->plh_lwb);
-
-       /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
-        * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
-       if (mark_as_dirty)
-               mark_inode_dirty_sync(inode);
-}
-EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
-
-void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
-{
-       struct inode *inode = data->inode;
-       struct nfs_inode *nfsi = NFS_I(inode);
-       bool mark_as_dirty = false;
-
-       spin_lock(&inode->i_lock);
-       if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
                mark_as_dirty = true;
                dprintk("%s: Set layoutcommit for inode %lu ",
                        __func__, inode->i_ino);
-       }
-       if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
+       } else if (end_pos > nfsi->layout->plh_lwb)
+               nfsi->layout->plh_lwb = end_pos;
+       if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
                /* references matched in nfs4_layoutcommit_release */
-               pnfs_get_lseg(data->lseg);
+               pnfs_get_lseg(lseg);
        }
-       if (data->lwb > nfsi->layout->plh_lwb)
-               nfsi->layout->plh_lwb = data->lwb;
        spin_unlock(&inode->i_lock);
        dprintk("%s: lseg %p end_pos %llu\n",
-               __func__, data->lseg, nfsi->layout->plh_lwb);
+               __func__, lseg, nfsi->layout->plh_lwb);
 
        /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
         * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
        if (mark_as_dirty)
                mark_inode_dirty_sync(inode);
 }
-EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
+EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
 
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
 {
@@ -2216,7 +2188,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
        pnfs_list_write_lseg(inode, &data->lseg_list);
 
        end_pos = nfsi->layout->plh_lwb;
-       nfsi->layout->plh_lwb = 0;
 
        nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
        spin_unlock(&inode->i_lock);
@@ -2233,11 +2204,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
                status = ld->prepare_layoutcommit(&data->args);
                if (status) {
                        spin_lock(&inode->i_lock);
-                       if (end_pos < nfsi->layout->plh_lwb)
+                       set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+                       if (end_pos > nfsi->layout->plh_lwb)
                                nfsi->layout->plh_lwb = end_pos;
                        spin_unlock(&inode->i_lock);
                        put_rpccred(data->cred);
-                       set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
                        goto clear_layoutcommitting;
                }
        }
@@ -2258,6 +2229,13 @@ clear_layoutcommitting:
 }
 EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
 
+int
+pnfs_generic_sync(struct inode *inode, bool datasync)
+{
+       return pnfs_layoutcommit_inode(inode, true); /* datasync unused; sync=true */
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_sync);
+
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 {
        struct nfs4_threshold *thp;
index 084c9144f86db477330c7504158efd476bf80137..1e6308f82fc3d5887de850e72226233e4f2138d2 100644 (file)
@@ -155,6 +155,8 @@ struct pnfs_layoutdriver_type {
                               int how,
                               struct nfs_commit_info *cinfo);
 
+       int (*sync)(struct inode *inode, bool datasync);
+
        /*
         * Return PNFS_ATTEMPTED to indicate the layout code has attempted
         * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
@@ -203,6 +205,7 @@ struct pnfs_device {
        struct page **pages;
        unsigned int  pgbase;
        unsigned int  pglen;    /* reply buffer length */
+       unsigned char nocache : 1;/* May not be cached */
 };
 
 #define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -263,10 +266,11 @@ bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
-void pnfs_set_layoutcommit(struct nfs_pgio_header *);
-void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
+void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int pnfs_generic_sync(struct inode *inode, bool datasync);
+int pnfs_nfs_generic_sync(struct inode *inode, bool datasync);
 int _pnfs_return_layout(struct inode *);
 int pnfs_commit_and_return_layout(struct inode *);
 void pnfs_ld_write_done(struct nfs_pgio_header *);
@@ -291,6 +295,7 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
 enum {
        NFS_DEVICEID_INVALID = 0,       /* set when MDS clientid recalled */
        NFS_DEVICEID_UNAVAILABLE,       /* device temporarily unavailable */
+       NFS_DEVICEID_NOCACHE,           /* device may not be cached */
 };
 
 /* pnfs_dev.c */
@@ -302,6 +307,7 @@ struct nfs4_deviceid_node {
        unsigned long                   flags;
        unsigned long                   timestamp_unavailable;
        struct nfs4_deviceid            deviceid;
+       struct rcu_head                 rcu;
        atomic_t                        ref;
 };
 
@@ -486,6 +492,14 @@ pnfs_ld_read_whole_page(struct inode *inode)
        return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
 }
 
+static inline int
+pnfs_sync_inode(struct inode *inode, bool datasync)
+{
+       if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+               return 0; /* check failed: report success without a driver call */
+       return NFS_SERVER(inode)->pnfs_curr_ld->sync(inode, datasync); /* NOTE(review): ->sync assumed non-NULL; confirm */
+}
+
 static inline bool
 pnfs_layoutcommit_outstanding(struct inode *inode)
 {
@@ -568,6 +582,12 @@ pnfs_ld_read_whole_page(struct inode *inode)
        return false;
 }
 
+static inline int
+pnfs_sync_inode(struct inode *inode, bool datasync)
+{
+       return 0; /* this variant ignores both arguments and always succeeds */
+}
+
 static inline bool
 pnfs_roc(struct inode *ino)
 {
index aa2ec00151839f1519a2c7e0ab65509d9f2cfba3..2961fcd7a2df9292bf4f611287c2e48bdfe31ad2 100644 (file)
@@ -149,6 +149,8 @@ nfs4_get_device_info(struct nfs_server *server,
         */
        d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
                        gfp_flags);
+       if (d && pdev->nocache)
+               set_bit(NFS_DEVICEID_NOCACHE, &d->flags);
 
 out_free_pages:
        for (i = 0; i < max_pages; i++)
@@ -175,8 +177,8 @@ __nfs4_find_get_deviceid(struct nfs_server *server,
        rcu_read_lock();
        d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
                        hash);
-       if (d != NULL)
-               atomic_inc(&d->ref);
+       if (d != NULL && !atomic_inc_not_zero(&d->ref))
+               d = NULL;
        rcu_read_unlock();
        return d;
 }
@@ -235,12 +237,11 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
                return;
        }
        hlist_del_init_rcu(&d->node);
+       clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
        spin_unlock(&nfs4_deviceid_lock);
-       synchronize_rcu();
 
        /* balance the initial ref set in pnfs_insert_deviceid */
-       if (atomic_dec_and_test(&d->ref))
-               d->ld->free_deviceid_node(d);
+       nfs4_put_deviceid_node(d);
 }
 EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
 
@@ -271,6 +272,11 @@ EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
 bool
 nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
 {
+       if (test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) {
+               if (atomic_add_unless(&d->ref, -1, 2))
+                       return false;
+               nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid);
+       }
        if (!atomic_dec_and_test(&d->ref))
                return false;
        d->ld->free_deviceid_node(d);
@@ -314,6 +320,7 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash)
                if (d->nfs_client == clp && atomic_read(&d->ref)) {
                        hlist_del_init_rcu(&d->node);
                        hlist_add_head(&d->tmpnode, &tmp);
+                       clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
                }
        rcu_read_unlock();
        spin_unlock(&nfs4_deviceid_lock);
@@ -321,12 +328,10 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash)
        if (hlist_empty(&tmp))
                return;
 
-       synchronize_rcu();
        while (!hlist_empty(&tmp)) {
                d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode);
                hlist_del(&d->tmpnode);
-               if (atomic_dec_and_test(&d->ref))
-                       d->ld->free_deviceid_node(d);
+               nfs4_put_deviceid_node(d);
        }
 }
 
index 54e36b38fb5f89310287635e0838601ad07cf34a..f37e25b6311c83ac890508207be3e7d6cdc7bdda 100644 (file)
@@ -561,7 +561,7 @@ static bool load_v3_ds_connect(void)
        return(get_v3_ds_connect != NULL);
 }
 
-void __exit nfs4_pnfs_v3_ds_connect_unload(void)
+void nfs4_pnfs_v3_ds_connect_unload(void)
 {
        if (get_v3_ds_connect) {
                symbol_put(nfs3_set_ds_client);
@@ -868,3 +868,13 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
        nfs_request_add_commit_list(req, list, cinfo);
 }
 EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
+
+int
+pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
+{
+       if (datasync)
+               return 0; /* datasync request: no layoutcommit is issued */
+       return pnfs_layoutcommit_inode(inode, true); /* second argument is 'sync' */
+}
+EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync);
+
index a5b7427c37548dab4d7c3625c9f3e6096f3fde5c..ae0ff7a11b40339a728ca819283400c7f5e25a42 100644 (file)
@@ -284,7 +284,7 @@ int nfs_readpage(struct file *file, struct page *page)
        dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
                page, PAGE_CACHE_SIZE, page_file_index(page));
        nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
-       nfs_inc_stats(inode, NFSIOS_READPAGES);
+       nfs_add_stats(inode, NFSIOS_READPAGES, 1);
 
        /*
         * Try to flush any pending writes to the file..
index 21f8f52bf37d02b509294a940cdfa0456e970ea7..f175b833b6ba75b22bced1f9d013ad670caed82d 100644 (file)
@@ -43,7 +43,6 @@
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
-#include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
@@ -2193,7 +2192,7 @@ nfs_compare_remount_data(struct nfs_server *nfss,
            data->version != nfss->nfs_client->rpc_ops->version ||
            data->minorversion != nfss->nfs_client->cl_minorversion ||
            data->retrans != nfss->client->cl_timeout->to_retries ||
-           data->selected_flavor != nfss->client->cl_auth->au_flavor ||
+           !nfs_auth_info_match(&data->auth_info, nfss->client->cl_auth->au_flavor) ||
            data->acregmin != nfss->acregmin / HZ ||
            data->acregmax != nfss->acregmax / HZ ||
            data->acdirmin != nfss->acdirmin / HZ ||
@@ -2241,7 +2240,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
        data->wsize = nfss->wsize;
        data->retrans = nfss->client->cl_timeout->to_retries;
        data->selected_flavor = nfss->client->cl_auth->au_flavor;
-       data->auth_info = nfss->auth_info;
        data->acregmin = nfss->acregmin / HZ;
        data->acregmax = nfss->acregmax / HZ;
        data->acdirmin = nfss->acdirmin / HZ;
index 3612b4622337bb3ea3c4c5ff3b72ae6ae2e78e24..d12a4be613a5ced58599f8095822f4659b32f9a4 100644 (file)
@@ -580,7 +580,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
        int ret;
 
        nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
-       nfs_inc_stats(inode, NFSIOS_WRITEPAGES);
+       nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
        nfs_pageio_cond_complete(pgio, page_file_index(page));
        ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
@@ -1840,17 +1840,16 @@ EXPORT_SYMBOL_GPL(nfs_write_inode);
  */
 int nfs_wb_all(struct inode *inode)
 {
-       struct writeback_control wbc = {
-               .sync_mode = WB_SYNC_ALL,
-               .nr_to_write = LONG_MAX,
-               .range_start = 0,
-               .range_end = LLONG_MAX,
-       };
        int ret;
 
        trace_nfs_writeback_inode_enter(inode);
 
-       ret = sync_inode(inode, &wbc);
+       ret = filemap_write_and_wait(inode->i_mapping);
+       if (!ret) {
+               ret = nfs_commit_inode(inode, FLUSH_SYNC);
+               if (!ret)
+                       pnfs_sync_inode(inode, true);
+       }
 
        trace_nfs_writeback_inode_exit(inode, ret);
        return ret;
index 410abd172febee648d8f63cab2e00592773ddaed..b95f914ce083891325b6b8defa3f9995851cf76b 100644 (file)
@@ -511,6 +511,7 @@ extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned
  * Try to write back everything synchronously (but check the
  * return value!)
  */
+extern int nfs_sync_inode(struct inode *inode);
 extern int nfs_wb_all(struct inode *inode);
 extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
index 4cb3eaa89cf708a57038049db0df75144155d920..93ab6071bbe967b56ea44c56111157f8a9135e3f 100644 (file)
@@ -255,11 +255,13 @@ struct nfs4_layoutget {
 struct nfs4_getdeviceinfo_args {
        struct nfs4_sequence_args seq_args;
        struct pnfs_device *pdev;
+       __u32 notify_types;
 };
 
 struct nfs4_getdeviceinfo_res {
        struct nfs4_sequence_res seq_res;
        struct pnfs_device *pdev;
+       __u32 notification;
 };
 
 struct nfs4_layoutcommit_args {
@@ -1271,11 +1273,15 @@ struct nfs42_falloc_args {
        nfs4_stateid                     falloc_stateid;
        u64                              falloc_offset;
        u64                              falloc_length;
+       const u32                       *falloc_bitmask;
 };
 
 struct nfs42_falloc_res {
        struct nfs4_sequence_res        seq_res;
        unsigned int                    status;
+
+       struct nfs_fattr                *falloc_fattr;
+       const struct nfs_server         *falloc_server;
 };
 
 struct nfs42_seek_args {
index aadc6a04e1acb91bb4baa40b545237d4822dfaee..807371357160ae97f6d6120ba83b26bbc48b8e66 100644 (file)
@@ -142,12 +142,18 @@ typedef __be32    rpc_fraghdr;
        (RPC_REPHDRSIZE + (2 + RPC_MAX_AUTH_SIZE/4))
 
 /*
- * RFC1833/RFC3530 rpcbind (v3+) well-known netid's.
+ * Well-known netids. See:
+ *
+ *   http://www.iana.org/assignments/rpc-netids/rpc-netids.xhtml
  */
 #define RPCBIND_NETID_UDP      "udp"
 #define RPCBIND_NETID_TCP      "tcp"
+#define RPCBIND_NETID_RDMA     "rdma"
+#define RPCBIND_NETID_SCTP     "sctp"
 #define RPCBIND_NETID_UDP6     "udp6"
 #define RPCBIND_NETID_TCP6     "tcp6"
+#define RPCBIND_NETID_RDMA6    "rdma6"
+#define RPCBIND_NETID_SCTP6    "sctp6"
 #define RPCBIND_NETID_LOCAL    "local"
 
 /*
index 64a0a0a97b2396492352f99fa1a1a04f225ef06a..c984c85981eae2881ebdac43ffdbfba9b55b70b4 100644 (file)
 #ifndef _LINUX_SUNRPC_XPRTRDMA_H
 #define _LINUX_SUNRPC_XPRTRDMA_H
 
-/*
- * rpcbind (v3+) RDMA netid.
- */
-#define RPCBIND_NETID_RDMA     "rdma"
-
 /*
  * Constants. Max RPC/NFS header is big enough to account for
  * additional marshaling buffers passed down by Linux client.
index 8d4b1c7b24d4a57e3ec46b4d2e6b249fd946017f..038e36c96669ef915bcca3ad42c8b0ef8b4eb065 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * include/linux/nfs_idmap.h
+ * include/uapi/linux/nfs_idmap.h
  *
  *  UID and GID to name mapping for clients.
  *
index b91fd9c597b45158cc9ce88a8fb4caa5130faf96..337ca851a350cc412532ff4d3c96d87a8605e785 100644 (file)
@@ -89,8 +89,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
        if (!task->tk_timeout)
                return;
 
-       dprintk("RPC: %5u setting alarm for %lu ms\n",
-                       task->tk_pid, task->tk_timeout * 1000 / HZ);
+       dprintk("RPC: %5u setting alarm for %u ms\n",
+               task->tk_pid, jiffies_to_msecs(task->tk_timeout));
 
        task->u.tk_wait.expires = jiffies + task->tk_timeout;
        if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires))
index 9949722d99cebf6afa15953d8a9ac6a5c0bc2824..1d4fe24af06a1115bd80538c5346ae2f843f1eb8 100644 (file)
@@ -326,6 +326,15 @@ out_unlock:
        xprt_clear_locked(xprt);
 }
 
+static void xprt_task_clear_bytes_sent(struct rpc_task *task)
+{
+       if (task != NULL) { /* tolerates a NULL task or a task with no request */
+               struct rpc_rqst *req = task->tk_rqstp;
+               if (req != NULL)
+                       req->rq_bytes_sent = 0; /* reset the request's sent-byte count */
+       }
+}
+
 /**
  * xprt_release_xprt - allow other requests to use a transport
  * @xprt: transport with other tasks potentially waiting
@@ -336,11 +345,7 @@ out_unlock:
 void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
 {
        if (xprt->snd_task == task) {
-               if (task != NULL) {
-                       struct rpc_rqst *req = task->tk_rqstp;
-                       if (req != NULL)
-                               req->rq_bytes_sent = 0;
-               }
+               xprt_task_clear_bytes_sent(task);
                xprt_clear_locked(xprt);
                __xprt_lock_write_next(xprt);
        }
@@ -358,11 +363,7 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt);
 void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
 {
        if (xprt->snd_task == task) {
-               if (task != NULL) {
-                       struct rpc_rqst *req = task->tk_rqstp;
-                       if (req != NULL)
-                               req->rq_bytes_sent = 0;
-               }
+               xprt_task_clear_bytes_sent(task);
                xprt_clear_locked(xprt);
                __xprt_lock_write_next_cong(xprt);
        }
@@ -700,6 +701,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt,
                goto out;
        if (xprt->snd_task != task)
                goto out;
+       xprt_task_clear_bytes_sent(task);
        xprt->snd_task = cookie;
        ret = true;
 out:
index da5136fd56943fa91e20946d2a89346c3da556b2..579f72bbcf4ba66eb58b7c9cfde0bde812c996e2 100644 (file)
@@ -1,6 +1,7 @@
 obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
 
-xprtrdma-y := transport.o rpc_rdma.o verbs.o
+xprtrdma-y := transport.o rpc_rdma.o verbs.o \
+       fmr_ops.o frwr_ops.o physical_ops.o
 
 obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
 
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
new file mode 100644 (file)
index 0000000..302d4eb
--- /dev/null
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* Lightweight memory registration using Fast Memory Regions (FMR).
+ * Referred to sometimes as MTHCAFMR mode.
+ *
+ * FMR uses synchronous memory registration and deregistration.
+ * FMR registration is known to be fast, but FMR deregistration
+ * can take tens of usecs to complete.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY       RPCDBG_TRANS
+#endif
+
+/* Maximum scatter/gather per FMR */
+#define RPCRDMA_MAX_FMR_SGES   (64)
+
+static int
+fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+           struct rpcrdma_create_data_internal *cdata)
+{
+       return 0; /* no-op for FMR: all three arguments are ignored */
+}
+
+/* Returns min(RPCRDMA_MAX_DATA_SEGS, max segments * RPCRDMA_MAX_FMR_SGES);
+ * FMR mode conveys up to 64 pages of payload per chunk segment. */
+static size_t
+fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+       return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+                    rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+}
+
+static int
+fmr_op_init(struct rpcrdma_xprt *r_xprt) /* allocate one rpcrdma_mw + FMR per slot */
+{
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+       int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
+       struct ib_fmr_attr fmr_attr = {
+               .max_pages      = RPCRDMA_MAX_FMR_SGES,
+               .max_maps       = 1,
+               .page_shift     = PAGE_SHIFT
+       };
+       struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+       struct rpcrdma_mw *r;
+       int i, rc;
+
+       INIT_LIST_HEAD(&buf->rb_mws);
+       INIT_LIST_HEAD(&buf->rb_all);
+
+       i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+       dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
+
+       while (i--) {
+               r = kzalloc(sizeof(*r), GFP_KERNEL);
+               if (!r)
+                       return -ENOMEM; /* earlier entries remain on buf->rb_all */
+
+               r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
+               if (IS_ERR(r->r.fmr))
+                       goto out_fmr_err;
+
+               list_add(&r->mw_list, &buf->rb_mws);
+               list_add(&r->mw_all, &buf->rb_all);
+       }
+       return 0;
+
+out_fmr_err:
+       rc = PTR_ERR(r->r.fmr);
+       dprintk("RPC:       %s: ib_alloc_fmr status %i\n", __func__, rc);
+       kfree(r); /* r was never added to either list */
+       return rc;
+}
+
+/* Use the ib_map_phys_fmr() verb to register a memory region
+ * for remote access via RDMA READ or RDMA WRITE. Returns the number
+ * of segments mapped, or the ib_map_phys_fmr() status on failure. */
+static int
+fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+          int nsegs, bool writing)
+{
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct ib_device *device = ia->ri_id->device;
+       enum dma_data_direction direction = rpcrdma_data_dir(writing);
+       struct rpcrdma_mr_seg *seg1 = seg;
+       struct rpcrdma_mw *mw = seg1->rl_mw;
+       u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
+       int len, pageoff, i, rc;
+
+       pageoff = offset_in_page(seg1->mr_offset);
+       seg1->mr_offset -= pageoff;     /* start of page */
+       seg1->mr_len += pageoff;
+       len = -pageoff; /* cancels the pageoff just added to seg1->mr_len */
+       if (nsegs > RPCRDMA_MAX_FMR_SGES)
+               nsegs = RPCRDMA_MAX_FMR_SGES;
+       for (i = 0; i < nsegs;) {
+               rpcrdma_map_one(device, seg, direction);
+               physaddrs[i] = seg->mr_dma;
+               len += seg->mr_len;
+               ++seg;
+               ++i;
+               /* Check for holes */
+               if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+                   offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+                       break;
+       }
+
+       rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
+       if (rc)
+               goto out_maperr;
+
+       seg1->mr_rkey = mw->r.fmr->rkey;
+       seg1->mr_base = seg1->mr_dma + pageoff;
+       seg1->mr_nsegs = i;
+       seg1->mr_len = len;
+       return i;
+
+out_maperr:
+       dprintk("RPC:       %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
+               __func__, len, (unsigned long long)seg1->mr_dma,
+               pageoff, i, rc);
+       while (i--)
+               rpcrdma_unmap_one(device, --seg); /* undo the i mappings, last first */
+       return rc;
+}
+
+/* Use the ib_unmap_fmr() verb to prevent further remote
+ * access via RDMA READ or RDMA WRITE. Returns the segment count
+ * whether or not ib_unmap_fmr() failed; a failure is only logged. */
+static int
+fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct rpcrdma_mr_seg *seg1 = seg;
+       struct ib_device *device;
+       int rc, nsegs = seg->mr_nsegs; /* saved: the loop below counts mr_nsegs down */
+       LIST_HEAD(l);
+
+       list_add(&seg1->rl_mw->r.fmr->list, &l);
+       rc = ib_unmap_fmr(&l);
+       read_lock(&ia->ri_qplock);
+       device = ia->ri_id->device;
+       while (seg1->mr_nsegs--)
+               rpcrdma_unmap_one(device, seg++);
+       read_unlock(&ia->ri_qplock);
+       if (rc)
+               goto out_err;
+       return nsegs;
+
+out_err:
+       dprintk("RPC:       %s: ib_unmap_fmr status %i\n", __func__, rc);
+       return nsegs;
+}
+
+/* After a disconnect, unmap all FMRs on buf->rb_all in one ib_unmap_fmr().
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_fmr_external().
+ * NOTE(review): that name is not in this file; likely fmr_op_map() now. */
+static void
+fmr_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+       struct rpcrdma_mw *r;
+       LIST_HEAD(list);
+       int rc;
+
+       list_for_each_entry(r, &buf->rb_all, mw_all)
+               list_add(&r->r.fmr->list, &list);
+
+       rc = ib_unmap_fmr(&list);
+       if (rc)
+               dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
+                       __func__, rc);
+}
+
+static void
+fmr_op_destroy(struct rpcrdma_buffer *buf) /* drain buf->rb_all, freeing each entry */
+{
+       struct rpcrdma_mw *r;
+       int rc;
+
+       while (!list_empty(&buf->rb_all)) {
+               r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+               list_del(&r->mw_all);
+               rc = ib_dealloc_fmr(r->r.fmr);
+               if (rc)
+                       dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
+                               __func__, rc);
+               kfree(r); /* freed even when ib_dealloc_fmr() reported an error */
+       }
+}
+
+const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { /* binds the fmr_op_* functions above */
+       .ro_map                         = fmr_op_map,
+       .ro_unmap                       = fmr_op_unmap,
+       .ro_open                        = fmr_op_open,
+       .ro_maxpages                    = fmr_op_maxpages,
+       .ro_init                        = fmr_op_init,
+       .ro_reset                       = fmr_op_reset,
+       .ro_destroy                     = fmr_op_destroy,
+       .ro_displayname                 = "fmr",
+};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
new file mode 100644 (file)
index 0000000..dff0481
--- /dev/null
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* Lightweight memory registration using Fast Registration Work
+ * Requests (FRWR). Also referred to sometimes as FRMR mode.
+ *
+ * FRWR features ordered asynchronous registration and deregistration
+ * of arbitrarily sized memory regions. This is the fastest and safest
+ * but most complex memory registration mode.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY       RPCDBG_TRANS
+#endif
+
+/* Allocate the fast-register MR and the page list for one FRMR.
+ *
+ * Returns zero on success, or the errno carried by whichever
+ * allocation failed.
+ */
+static int
+__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
+           unsigned int depth)
+{
+       struct rpcrdma_frmr *frmr = &r->r.frmr;
+       int status;
+
+       frmr->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+       if (IS_ERR(frmr->fr_mr)) {
+               status = PTR_ERR(frmr->fr_mr);
+               dprintk("RPC:       %s: ib_alloc_fast_reg_mr status %i\n",
+                       __func__, status);
+               return status;
+       }
+
+       frmr->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
+       if (IS_ERR(frmr->fr_pgl)) {
+               status = PTR_ERR(frmr->fr_pgl);
+               dprintk("RPC:       %s: ib_alloc_fast_reg_page_list status %i\n",
+                       __func__, status);
+               /* Give back the MR that was allocated just above */
+               ib_dereg_mr(frmr->fr_mr);
+               return status;
+       }
+
+       return 0;
+}
+
+/* Deregister an FRMR's MR and free its page list. A deregistration
+ * failure is logged; the page list is freed regardless.
+ */
+static void
+__frwr_release(struct rpcrdma_mw *r)
+{
+       struct rpcrdma_frmr *frmr = &r->r.frmr;
+       int status;
+
+       status = ib_dereg_mr(frmr->fr_mr);
+       if (status)
+               dprintk("RPC:       %s: ib_dereg_mr status %i\n",
+                       __func__, status);
+
+       ib_free_fast_reg_page_list(frmr->fr_pgl);
+}
+
+/* Size the send queue for FRWR mode.
+ *
+ * Records the FRMR page list depth to use (the device limit, capped
+ * at RPCRDMA_MAX_DATA_SEGS), then scales max_send_wr by the number
+ * of Work Requests one RPC may post. If the result exceeds the
+ * device's max_qp_wr, cdata->max_requests is reduced to fit.
+ *
+ * Returns zero, or -EINVAL if not even one request fits.
+ */
+static int
+frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+            struct rpcrdma_create_data_internal *cdata)
+{
+       struct ib_device_attr *devattr = &ia->ri_devattr;
+       int wrs, remaining;
+
+       ia->ri_max_frmr_depth =
+                       min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+                             devattr->max_fast_reg_page_list_len);
+       dprintk("RPC:       %s: device's max FR page list len = %u\n",
+               __func__, ia->ri_max_frmr_depth);
+
+       /* Work Requests needed per RPC:
+        *   head:     one FRMR reg, one FRMR invalidate
+        *   pagelist: N FRMR regs, N FRMR invalidates
+        *   tail:     one FRMR reg, one FRMR invalidate
+        *   plus the RDMA_SEND itself
+        * With N == 1 that comes to seven.
+        */
+       wrs = 7;
+
+       /* N grows when one FRMR cannot cover RPCRDMA_MAX_DATA_SEGS */
+       if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
+               remaining = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth;
+               do {
+                       wrs += 2;       /* another reg + invalidate pair */
+                       remaining -= ia->ri_max_frmr_depth;
+               } while (remaining > 0);
+       }
+
+       ep->rep_attr.cap.max_send_wr *= wrs;
+       if (ep->rep_attr.cap.max_send_wr <= devattr->max_qp_wr)
+               return 0;
+
+       /* Too deep for this device: shrink the request count to fit */
+       cdata->max_requests = devattr->max_qp_wr / wrs;
+       if (!cdata->max_requests)
+               return -EINVAL;
+       ep->rep_attr.cap.max_send_wr = cdata->max_requests * wrs;
+       return 0;
+}
+
+/* In FRWR mode each chunk segment conveys a list of pages, at most
+ * the FRWR page list depth long. The result is capped at
+ * RPCRDMA_MAX_DATA_SEGS.
+ */
+static size_t
+frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+       unsigned int pages;
+
+       pages = rpcrdma_max_segments(r_xprt) *
+               r_xprt->rx_ia.ri_max_frmr_depth;
+       return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, pages);
+}
+
+/* Send completion handler for FRMRs: when a FAST_REG or LOCAL_INV
+ * did not succeed, flag the FRMR as stale so that it gets reset.
+ */
+static void
+frwr_sendcompletion(struct ib_wc *wc)
+{
+       struct rpcrdma_mw *mw;
+
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               /* WARNING: Only wr_id and status are reliable here */
+               mw = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+               dprintk("RPC:       %s: frmr %p (stale), status %d\n",
+                       __func__, mw, wc->status);
+               mw->r.frmr.fr_state = FRMR_IS_STALE;
+       }
+}
+
+/* Populate the transport's FRMR pool.
+ *
+ * Allocates (rb_max_requests + 1) * RPCRDMA_MAX_SEGS FRMRs and links
+ * each onto both rb_mws and rb_all. Returns zero on success, -ENOMEM
+ * if an rpcrdma_mw cannot be allocated, or the __frwr_init() status.
+ * FRMRs already linked when a failure occurs stay on the lists.
+ */
+static int
+frwr_op_init(struct rpcrdma_xprt *r_xprt)
+{
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+       struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+       unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+       struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+       struct rpcrdma_mw *mw;
+       int count, status;
+
+       INIT_LIST_HEAD(&buf->rb_mws);
+       INIT_LIST_HEAD(&buf->rb_all);
+
+       count = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+       dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, count);
+
+       while (count--) {
+               mw = kzalloc(sizeof(*mw), GFP_KERNEL);
+               if (!mw)
+                       return -ENOMEM;
+
+               status = __frwr_init(mw, pd, device, depth);
+               if (status) {
+                       kfree(mw);
+                       return status;
+               }
+
+               mw->mw_sendcompletion = frwr_sendcompletion;
+               list_add(&mw->mw_list, &buf->rb_mws);
+               list_add(&mw->mw_all, &buf->rb_all);
+       }
+
+       return 0;
+}
+
+/* Post a FAST_REG Work Request to register a memory region
+ * for remote access via RDMA READ or RDMA WRITE.
+ *
+ * Up to ri_max_frmr_depth segments, starting at @seg, are DMA mapped
+ * and their page addresses gathered into the FRMR's page list.
+ * Gathering stops early after a segment that does not end on a page
+ * boundary, or before one that does not start on a page boundary.
+ *
+ * Returns the number of segments registered. If ib_post_send() fails,
+ * its status is returned instead, after the segments have been DMA
+ * unmapped, the key byte rolled back and the FRMR marked INVALID.
+ */
+static int
+frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+           int nsegs, bool writing)
+{
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct ib_device *device = ia->ri_id->device;
+       enum dma_data_direction direction = rpcrdma_data_dir(writing);
+       struct rpcrdma_mr_seg *seg1 = seg;
+       struct rpcrdma_mw *mw = seg1->rl_mw;
+       struct rpcrdma_frmr *frmr = &mw->r.frmr;
+       struct ib_mr *mr = frmr->fr_mr;
+       struct ib_send_wr fastreg_wr, *bad_wr;
+       u8 key;
+       int len, pageoff;
+       int i, rc;
+       int seg_len;
+       u64 pa;
+       int page_no;
+
+       pageoff = offset_in_page(seg1->mr_offset);
+       seg1->mr_offset -= pageoff;     /* start of page */
+       seg1->mr_len += pageoff;
+       len = -pageoff; /* cancels the pageoff just added to seg1->mr_len */
+       if (nsegs > ia->ri_max_frmr_depth)
+               nsegs = ia->ri_max_frmr_depth;
+       for (page_no = i = 0; i < nsegs;) {
+               rpcrdma_map_one(device, seg, direction);
+               pa = seg->mr_dma;
+               for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
+                       frmr->fr_pgl->page_list[page_no++] = pa;
+                       pa += PAGE_SIZE;
+               }
+               len += seg->mr_len;
+               ++seg;
+               ++i;
+               /* Check for holes: the next segment must start, and the
+                * one just mapped must end, on a page boundary
+                */
+               if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+                   offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+                       break;
+       }
+       dprintk("RPC:       %s: Using frmr %p to map %d segments (%d bytes)\n",
+               __func__, mw, i, len);
+
+       frmr->fr_state = FRMR_IS_VALID;
+
+       memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+       fastreg_wr.wr_id = (unsigned long)(void *)mw; /* cast back to the mw in frwr_sendcompletion() */
+       fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+       fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
+       fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
+       fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+       fastreg_wr.wr.fast_reg.page_list_len = page_no;
+       fastreg_wr.wr.fast_reg.length = len;
+       fastreg_wr.wr.fast_reg.access_flags = writing ?
+                               IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+                               IB_ACCESS_REMOTE_READ;
+       key = (u8)(mr->rkey & 0x000000FF); /* low byte of the current rkey */
+       ib_update_fast_reg_key(mr, ++key); /* bump it for this registration */
+       fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+
+       DECR_CQCOUNT(&r_xprt->rx_ep);
+       rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
+       if (rc)
+               goto out_senderr;
+
+       seg1->mr_rkey = mr->rkey;
+       seg1->mr_base = seg1->mr_dma + pageoff;
+       seg1->mr_nsegs = i;
+       seg1->mr_len = len;
+       return i;
+
+out_senderr:
+       dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
+       ib_update_fast_reg_key(mr, --key); /* undo the key bump above */
+       frmr->fr_state = FRMR_IS_INVALID;
+       while (i--) /* walk back over the i segments mapped above */
+               rpcrdma_unmap_one(device, --seg);
+       return rc;
+}
+
+/* Post a LOCAL_INV Work Request to prevent further remote access
+ * via RDMA READ or RDMA WRITE.
+ *
+ * The FRMR is marked INVALID, then each of the chunk's mr_nsegs
+ * segments is DMA unmapped and the LOCAL_INV is posted, both with
+ * ri_qplock read-held.
+ *
+ * Returns the chunk's segment count whether or not the post succeeds.
+ * When ib_post_send() fails, the FRMR is marked STALE.
+ */
+static int
+frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+       struct rpcrdma_mr_seg *seg1 = seg;
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct ib_send_wr invalidate_wr, *bad_wr;
+       int rc, nsegs = seg->mr_nsegs; /* saved: the unmap loop consumes mr_nsegs */
+       struct ib_device *device;
+
+       seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
+
+       memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+       invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
+       invalidate_wr.opcode = IB_WR_LOCAL_INV;
+       invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
+       DECR_CQCOUNT(&r_xprt->rx_ep);
+
+       read_lock(&ia->ri_qplock);
+       device = ia->ri_id->device; /* read under ri_qplock */
+       while (seg1->mr_nsegs--)
+               rpcrdma_unmap_one(device, seg++);
+       rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+       read_unlock(&ia->ri_qplock);
+       if (rc)
+               goto out_err;
+       return nsegs;
+
+out_err:
+       /* Force rpcrdma_buffer_get() to retry */
+       seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
+       dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
+       return nsegs;
+}
+
+/* A FAST_REG_MR flushed by a disconnect can leave its FRMR unusable.
+ * Walk rb_all and, for every FRMR that is not INVALID, release its
+ * resources and allocate fresh ones. That covers STALE FRMRs as well
+ * as VALID ones still attached to an rpcrdma_req.
+ *
+ * Each FRMR that is re-initialized successfully gets a fresh rkey
+ * and is left INVALID.
+ *
+ * Called only from the transport connect worker, which serializes
+ * it with rpcrdma_register_frmr_external().
+ */
+static void
+frwr_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+       struct ib_device *device = ia->ri_id->device;
+       unsigned int depth = ia->ri_max_frmr_depth;
+       struct ib_pd *pd = ia->ri_pd;
+       struct rpcrdma_mw *mw;
+
+       list_for_each_entry(mw, &buf->rb_all, mw_all) {
+               struct rpcrdma_frmr *frmr = &mw->r.frmr;
+
+               if (frmr->fr_state == FRMR_IS_INVALID)
+                       continue;
+
+               __frwr_release(mw);
+               if (!__frwr_init(mw, pd, device, depth)) {
+                       frmr->fr_state = FRMR_IS_INVALID;
+                       continue;
+               }
+
+               /* NOTE(review): the old MR and page list are already
+                * released at this point and the state is unchanged;
+                * confirm later users of this mw tolerate that.
+                */
+               dprintk("RPC:       %s: mw %p left %s\n",
+                       __func__, mw,
+                       (frmr->fr_state == FRMR_IS_STALE ?
+                               "stale" : "valid"));
+       }
+}
+
+/* Drain the buffer's rb_all list, releasing each FRMR's resources
+ * and freeing the rpcrdma_mw that tracks it.
+ */
+static void
+frwr_op_destroy(struct rpcrdma_buffer *buf)
+{
+       struct list_head *all = &buf->rb_all;
+       struct rpcrdma_mw *mw;
+
+       while (!list_empty(all)) {
+               mw = list_entry(all->next, struct rpcrdma_mw, mw_all);
+               list_del(&mw->mw_all);
+
+               __frwr_release(mw);
+               kfree(mw);
+       }
+}
+
+/* Memory registration method table for FRWR mode */
+const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
+       .ro_displayname = "frwr",
+       .ro_open        = frwr_op_open,
+       .ro_init        = frwr_op_init,
+       .ro_maxpages    = frwr_op_maxpages,
+       .ro_map         = frwr_op_map,
+       .ro_unmap       = frwr_op_unmap,
+       .ro_reset       = frwr_op_reset,
+       .ro_destroy     = frwr_op_destroy,
+};
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
new file mode 100644 (file)
index 0000000..ba518af
--- /dev/null
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* No-op chunk preparation. All client memory is pre-registered.
+ * Sometimes referred to as ALLPHYSICAL mode.
+ *
+ * Physical registration is simple because all client memory is
+ * pre-registered and never deregistered. This mode is good for
+ * adapter bring up, but is considered not safe: the server is
+ * trusted not to abuse its access to client memory not involved
+ * in RDMA I/O.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY       RPCDBG_TRANS
+#endif
+
+/* Physical mode makes no adjustment to the endpoint at open time;
+ * this always reports success.
+ */
+static int
+physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+                struct rpcrdma_create_data_internal *cdata)
+{
+       return 0;
+}
+
+/* With PHYSICAL memory registration each chunk segment conveys a
+ * single page, so the page limit is the segment limit, capped at
+ * RPCRDMA_MAX_DATA_SEGS.
+ */
+static size_t
+physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+       unsigned int segs = rpcrdma_max_segments(r_xprt);
+
+       return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, segs);
+}
+
+/* Physical mode sets up no per-transport registration state;
+ * this always reports success.
+ */
+static int
+physical_op_init(struct rpcrdma_xprt *r_xprt)
+{
+       return 0;
+}
+
+/* Client physical memory is already exposed for remote access via
+ * RDMA READ or RDMA WRITE, so mapping only DMA maps the one segment
+ * and fills in its rkey and base. Always consumes exactly one segment.
+ */
+static int
+physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+               int nsegs, bool writing)
+{
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct ib_device *device = ia->ri_id->device;
+       enum dma_data_direction direction = rpcrdma_data_dir(writing);
+
+       rpcrdma_map_one(device, seg, direction);
+
+       seg->mr_nsegs = 1;
+       seg->mr_base = seg->mr_dma;
+       seg->mr_rkey = ia->ri_bind_mem->rkey;
+       return 1;
+}
+
+/* DMA unmap one segment; the memory itself stays registered.
+ * Always releases exactly one segment.
+ */
+static int
+physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct ib_device *device;
+
+       read_lock(&ia->ri_qplock);
+       device = ia->ri_id->device;
+       rpcrdma_unmap_one(device, seg);
+       read_unlock(&ia->ri_qplock);
+       return 1;
+}
+
+static void
+physical_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+       /* Nothing to reset in physical mode */
+}
+
+static void
+physical_op_destroy(struct rpcrdma_buffer *buf)
+{
+       /* Nothing to destroy in physical mode */
+}
+
+/* Memory registration method table for ALLPHYSICAL mode */
+const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
+       .ro_displayname = "physical",
+       .ro_open        = physical_op_open,
+       .ro_init        = physical_op_init,
+       .ro_maxpages    = physical_op_maxpages,
+       .ro_map         = physical_op_map,
+       .ro_unmap       = physical_op_unmap,
+       .ro_reset       = physical_op_reset,
+       .ro_destroy     = physical_op_destroy,
+};
index 91ffde82fa0c49eba3e11385aec9e334eb6b699a..2c53ea9e1b83dae01ebdd1aa22d256174dfbae08 100644 (file)
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
+/* Chunk list types. The numeric value is also used as an index
+ * into transfertypes[], so the order must not change.
+ */
+enum rpcrdma_chunktype {
+       rpcrdma_noch    = 0,
+       rpcrdma_readch  = 1,
+       rpcrdma_areadch = 2,
+       rpcrdma_writech = 3,
+       rpcrdma_replych = 4,
+};
+
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static const char transfertypes[][12] = {
        "pure inline",  /* no chunks */
@@ -179,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
        struct rpcrdma_write_array *warray = NULL;
        struct rpcrdma_write_chunk *cur_wchunk = NULL;
        __be32 *iptr = headerp->rm_body.rm_chunks;
+       int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
 
        if (type == rpcrdma_readch || type == rpcrdma_areadch) {
                /* a read chunk - server will RDMA Read our memory */
@@ -201,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
        if (nsegs < 0)
                return nsegs;
 
+       map = r_xprt->rx_ia.ri_ops->ro_map;
        do {
-               n = rpcrdma_register_external(seg, nsegs,
-                                               cur_wchunk != NULL, r_xprt);
+               n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
                if (n <= 0)
                        goto out;
                if (cur_rchunk) {       /* read */
@@ -275,34 +284,13 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
        return (unsigned char *)iptr - (unsigned char *)headerp;
 
 out:
-       if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
-               for (pos = 0; nchunks--;)
-                       pos += rpcrdma_deregister_external(
-                                       &req->rl_segments[pos], r_xprt);
-       }
-       return n;
-}
+       if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+               return n;
 
-/*
- * Marshal chunks. This routine returns the header length
- * consumed by marshaling.
- *
- * Returns positive RPC/RDMA header size, or negative errno.
- */
-
-ssize_t
-rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
-{
-       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
-       struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf);
-
-       if (req->rl_rtype != rpcrdma_noch)
-               result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
-                                              headerp, req->rl_rtype);
-       else if (req->rl_wtype != rpcrdma_noch)
-               result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
-                                              headerp, req->rl_wtype);
-       return result;
+       for (pos = 0; nchunks--;)
+               pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
+                                                     &req->rl_segments[pos]);
+       return n;
 }
 
 /*
@@ -397,6 +385,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        char *base;
        size_t rpclen, padlen;
        ssize_t hdrlen;
+       enum rpcrdma_chunktype rtype, wtype;
        struct rpcrdma_msg *headerp;
 
        /*
@@ -433,13 +422,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * into pages; otherwise use reply chunks.
         */
        if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
-               req->rl_wtype = rpcrdma_noch;
+               wtype = rpcrdma_noch;
        else if (rqst->rq_rcv_buf.page_len == 0)
-               req->rl_wtype = rpcrdma_replych;
+               wtype = rpcrdma_replych;
        else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
-               req->rl_wtype = rpcrdma_writech;
+               wtype = rpcrdma_writech;
        else
-               req->rl_wtype = rpcrdma_replych;
+               wtype = rpcrdma_replych;
 
        /*
         * Chunks needed for arguments?
@@ -456,16 +445,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * TBD check NFSv4 setacl
         */
        if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
-               req->rl_rtype = rpcrdma_noch;
+               rtype = rpcrdma_noch;
        else if (rqst->rq_snd_buf.page_len == 0)
-               req->rl_rtype = rpcrdma_areadch;
+               rtype = rpcrdma_areadch;
        else
-               req->rl_rtype = rpcrdma_readch;
+               rtype = rpcrdma_readch;
 
        /* The following simplification is not true forever */
-       if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
-               req->rl_wtype = rpcrdma_noch;
-       if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
+       if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+               wtype = rpcrdma_noch;
+       if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
                dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
                        __func__);
                return -EIO;
@@ -479,7 +468,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * When padding is in use and applies to the transfer, insert
         * it and change the message type.
         */
-       if (req->rl_rtype == rpcrdma_noch) {
+       if (rtype == rpcrdma_noch) {
 
                padlen = rpcrdma_inline_pullup(rqst,
                                                RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -494,7 +483,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
                        headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
                        headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
                        hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
-                       if (req->rl_wtype != rpcrdma_noch) {
+                       if (wtype != rpcrdma_noch) {
                                dprintk("RPC:       %s: invalid chunk list\n",
                                        __func__);
                                return -EIO;
@@ -515,18 +504,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
                         * on receive. Therefore, we request a reply chunk
                         * for non-writes wherever feasible and efficient.
                         */
-                       if (req->rl_wtype == rpcrdma_noch)
-                               req->rl_wtype = rpcrdma_replych;
+                       if (wtype == rpcrdma_noch)
+                               wtype = rpcrdma_replych;
                }
        }
 
-       hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
+       if (rtype != rpcrdma_noch) {
+               hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
+                                              headerp, rtype);
+               wtype = rtype;  /* simplify dprintk */
+
+       } else if (wtype != rpcrdma_noch) {
+               hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
+                                              headerp, wtype);
+       }
        if (hdrlen < 0)
                return hdrlen;
 
        dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
                " headerp 0x%p base 0x%p lkey 0x%x\n",
-               __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
+               __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
                headerp, base, rdmab_lkey(req->rl_rdmabuf));
 
        /*
index 2e192baa59f3d83841746a4ea5769d8680a5f039..54f23b1be98617d5c29fb482a221ad6ed1391876 100644 (file)
@@ -156,13 +156,48 @@ static struct ctl_table sunrpc_table[] = {
 
 static struct rpc_xprt_ops xprt_rdma_procs;    /* forward reference */
 
+/* Fill in the IPv4-specific address strings: the netid, and the
+ * address as eight hex digits in host byte order.
+ */
+static void
+xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+       struct sockaddr_in *in4 = (struct sockaddr_in *)sap;
+       char hexaddr[20];
+
+       xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA;
+
+       snprintf(hexaddr, sizeof(hexaddr), "%08x",
+                ntohl(in4->sin_addr.s_addr));
+       xprt->address_strings[RPC_DISPLAY_HEX_ADDR] =
+                                       kstrdup(hexaddr, GFP_KERNEL);
+}
+
+/* Fill in the IPv6-specific address strings: the netid, and the
+ * address formatted with "%pi6".
+ */
+static void
+xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+       struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)sap;
+       char hexaddr[40];
+
+       xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
+
+       snprintf(hexaddr, sizeof(hexaddr), "%pi6", &in6->sin6_addr);
+       xprt->address_strings[RPC_DISPLAY_HEX_ADDR] =
+                                       kstrdup(hexaddr, GFP_KERNEL);
+}
+
 static void
 xprt_rdma_format_addresses(struct rpc_xprt *xprt)
 {
        struct sockaddr *sap = (struct sockaddr *)
                                        &rpcx_to_rdmad(xprt).addr;
-       struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-       char buf[64];
+       char buf[128];
+
+       switch (sap->sa_family) {
+       case AF_INET:
+               xprt_rdma_format_addresses4(xprt, sap);
+               break;
+       case AF_INET6:
+               xprt_rdma_format_addresses6(xprt, sap);
+               break;
+       default:
+               pr_err("rpcrdma: Unrecognized address family\n");
+               return;
+       }
 
        (void)rpc_ntop(sap, buf, sizeof(buf));
        xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
@@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt)
        snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
        xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
 
-       xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
-
-       snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
-       xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
-
        snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
        xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
 
-       /* netid */
-       xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
+       xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
 }
 
 static void
@@ -377,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args)
                          xprt_rdma_connect_worker);
 
        xprt_rdma_format_addresses(xprt);
-       xprt->max_payload = rpcrdma_max_payload(new_xprt);
+       xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
+       if (xprt->max_payload == 0)
+               goto out4;
+       xprt->max_payload <<= PAGE_SHIFT;
        dprintk("RPC:       %s: transport data payload maximum: %zu bytes\n",
                __func__, xprt->max_payload);
 
@@ -552,8 +584,8 @@ xprt_rdma_free(void *buffer)
 
        for (i = 0; req->rl_nchunks;) {
                --req->rl_nchunks;
-               i += rpcrdma_deregister_external(
-                       &req->rl_segments[i], r_xprt);
+               i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
+                                                   &req->rl_segments[i]);
        }
 
        rpcrdma_buffer_put(req);
@@ -579,10 +611,7 @@ xprt_rdma_send_request(struct rpc_task *task)
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
        int rc = 0;
 
-       if (req->rl_niovs == 0)
-               rc = rpcrdma_marshal_req(rqst);
-       else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL)
-               rc = rpcrdma_marshal_chunks(rqst, 0);
+       rc = rpcrdma_marshal_req(rqst);
        if (rc < 0)
                goto failed_marshal;
 
index e28909fddd30a3c273e4cfcbf5e600d005e916b2..4870d272e0067140dbdc7f16a914ee99dbabe9fc 100644 (file)
@@ -50,6 +50,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/prefetch.h>
+#include <linux/sunrpc/addr.h>
 #include <asm/bitops.h>
 
 #include "xprt_rdma.h"
@@ -62,9 +63,6 @@
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
-static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
-static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
-
 /*
  * internal functions
  */
@@ -188,7 +186,7 @@ static const char * const wc_status[] = {
        "remote access error",
        "remote operation error",
        "transport retry counter exceeded",
-       "RNR retrycounter exceeded",
+       "RNR retry counter exceeded",
        "local RDD violation error",
        "remove invalid RD request",
        "operation aborted",
@@ -206,21 +204,17 @@ static const char * const wc_status[] = {
 static void
 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
 {
-       if (likely(wc->status == IB_WC_SUCCESS))
-               return;
-
        /* WARNING: Only wr_id and status are reliable at this point */
-       if (wc->wr_id == 0ULL) {
-               if (wc->status != IB_WC_WR_FLUSH_ERR)
+       if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
+               if (wc->status != IB_WC_SUCCESS &&
+                   wc->status != IB_WC_WR_FLUSH_ERR)
                        pr_err("RPC:       %s: SEND: %s\n",
                               __func__, COMPLETION_MSG(wc->status));
        } else {
                struct rpcrdma_mw *r;
 
                r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
-               r->r.frmr.fr_state = FRMR_IS_STALE;
-               pr_err("RPC:       %s: frmr %p (stale): %s\n",
-                      __func__, r, COMPLETION_MSG(wc->status));
+               r->mw_sendcompletion(wc);
        }
 }
 
@@ -424,7 +418,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
        struct rpcrdma_ia *ia = &xprt->rx_ia;
        struct rpcrdma_ep *ep = &xprt->rx_ep;
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-       struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
+       struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
 #endif
        struct ib_qp_attr *attr = &ia->ri_qp_attr;
        struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
@@ -480,9 +474,8 @@ connected:
                wake_up_all(&ep->rep_connect_wait);
                /*FALLTHROUGH*/
        default:
-               dprintk("RPC:       %s: %pI4:%u (ep 0x%p): %s\n",
-                       __func__, &addr->sin_addr.s_addr,
-                       ntohs(addr->sin_port), ep,
+               dprintk("RPC:       %s: %pIS:%u (ep 0x%p): %s\n",
+                       __func__, sap, rpc_get_port(sap), ep,
                        CONNECTION_MSG(event->event));
                break;
        }
@@ -491,19 +484,16 @@ connected:
        if (connstate == 1) {
                int ird = attr->max_dest_rd_atomic;
                int tird = ep->rep_remote_cma.responder_resources;
-               printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
-                       "on %s, memreg %d slots %d ird %d%s\n",
-                       &addr->sin_addr.s_addr,
-                       ntohs(addr->sin_port),
+
+               pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
+                       sap, rpc_get_port(sap),
                        ia->ri_id->device->name,
-                       ia->ri_memreg_strategy,
+                       ia->ri_ops->ro_displayname,
                        xprt->rx_buf.rb_max_requests,
                        ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
        } else if (connstate < 0) {
-               printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
-                       &addr->sin_addr.s_addr,
-                       ntohs(addr->sin_port),
-                       connstate);
+               pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
+                       sap, rpc_get_port(sap), connstate);
        }
 #endif
 
@@ -621,17 +611,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 
        if (memreg == RPCRDMA_FRMR) {
                /* Requires both frmr reg and local dma lkey */
-               if ((devattr->device_cap_flags &
+               if (((devattr->device_cap_flags &
                     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
-                   (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+                   (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
+                     (devattr->max_fast_reg_page_list_len == 0)) {
                        dprintk("RPC:       %s: FRMR registration "
                                "not supported by HCA\n", __func__);
                        memreg = RPCRDMA_MTHCAFMR;
-               } else {
-                       /* Mind the ia limit on FRMR page list depth */
-                       ia->ri_max_frmr_depth = min_t(unsigned int,
-                               RPCRDMA_MAX_DATA_SEGS,
-                               devattr->max_fast_reg_page_list_len);
                }
        }
        if (memreg == RPCRDMA_MTHCAFMR) {
@@ -652,13 +638,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
         */
        switch (memreg) {
        case RPCRDMA_FRMR:
+               ia->ri_ops = &rpcrdma_frwr_memreg_ops;
                break;
        case RPCRDMA_ALLPHYSICAL:
+               ia->ri_ops = &rpcrdma_physical_memreg_ops;
                mem_priv = IB_ACCESS_LOCAL_WRITE |
                                IB_ACCESS_REMOTE_WRITE |
                                IB_ACCESS_REMOTE_READ;
                goto register_setup;
        case RPCRDMA_MTHCAFMR:
+               ia->ri_ops = &rpcrdma_fmr_memreg_ops;
                if (ia->ri_have_dma_lkey)
                        break;
                mem_priv = IB_ACCESS_LOCAL_WRITE;
@@ -678,8 +667,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                rc = -ENOMEM;
                goto out3;
        }
-       dprintk("RPC:       %s: memory registration strategy is %d\n",
-               __func__, memreg);
+       dprintk("RPC:       %s: memory registration strategy is '%s'\n",
+               __func__, ia->ri_ops->ro_displayname);
 
        /* Else will do memory reg/dereg for each chunk */
        ia->ri_memreg_strategy = memreg;
@@ -743,49 +732,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 
        ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
        ep->rep_attr.qp_context = ep;
-       /* send_cq and recv_cq initialized below */
        ep->rep_attr.srq = NULL;
        ep->rep_attr.cap.max_send_wr = cdata->max_requests;
-       switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_FRMR: {
-               int depth = 7;
-
-               /* Add room for frmr register and invalidate WRs.
-                * 1. FRMR reg WR for head
-                * 2. FRMR invalidate WR for head
-                * 3. N FRMR reg WRs for pagelist
-                * 4. N FRMR invalidate WRs for pagelist
-                * 5. FRMR reg WR for tail
-                * 6. FRMR invalidate WR for tail
-                * 7. The RDMA_SEND WR
-                */
-
-               /* Calculate N if the device max FRMR depth is smaller than
-                * RPCRDMA_MAX_DATA_SEGS.
-                */
-               if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
-                       int delta = RPCRDMA_MAX_DATA_SEGS -
-                                   ia->ri_max_frmr_depth;
-
-                       do {
-                               depth += 2; /* FRMR reg + invalidate */
-                               delta -= ia->ri_max_frmr_depth;
-                       } while (delta > 0);
-
-               }
-               ep->rep_attr.cap.max_send_wr *= depth;
-               if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
-                       cdata->max_requests = devattr->max_qp_wr / depth;
-                       if (!cdata->max_requests)
-                               return -EINVAL;
-                       ep->rep_attr.cap.max_send_wr = cdata->max_requests *
-                                                      depth;
-               }
-               break;
-       }
-       default:
-               break;
-       }
+       rc = ia->ri_ops->ro_open(ia, ep, cdata);
+       if (rc)
+               return rc;
        ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
        ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
        ep->rep_attr.cap.max_recv_sge = 1;
@@ -944,21 +895,9 @@ retry:
                rpcrdma_ep_disconnect(ep, ia);
                rpcrdma_flush_cqs(ep);
 
-               switch (ia->ri_memreg_strategy) {
-               case RPCRDMA_FRMR:
-                       rpcrdma_reset_frmrs(ia);
-                       break;
-               case RPCRDMA_MTHCAFMR:
-                       rpcrdma_reset_fmrs(ia);
-                       break;
-               case RPCRDMA_ALLPHYSICAL:
-                       break;
-               default:
-                       rc = -EIO;
-                       goto out;
-               }
-
                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+               ia->ri_ops->ro_reset(xprt);
+
                id = rpcrdma_create_id(xprt, ia,
                                (struct sockaddr *)&xprt->rx_data.addr);
                if (IS_ERR(id)) {
@@ -1123,91 +1062,6 @@ out:
        return ERR_PTR(rc);
 }
 
-static int
-rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
-{
-       int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
-       struct ib_fmr_attr fmr_attr = {
-               .max_pages      = RPCRDMA_MAX_DATA_SEGS,
-               .max_maps       = 1,
-               .page_shift     = PAGE_SHIFT
-       };
-       struct rpcrdma_mw *r;
-       int i, rc;
-
-       i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-       dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
-
-       while (i--) {
-               r = kzalloc(sizeof(*r), GFP_KERNEL);
-               if (r == NULL)
-                       return -ENOMEM;
-
-               r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
-               if (IS_ERR(r->r.fmr)) {
-                       rc = PTR_ERR(r->r.fmr);
-                       dprintk("RPC:       %s: ib_alloc_fmr failed %i\n",
-                               __func__, rc);
-                       goto out_free;
-               }
-
-               list_add(&r->mw_list, &buf->rb_mws);
-               list_add(&r->mw_all, &buf->rb_all);
-       }
-       return 0;
-
-out_free:
-       kfree(r);
-       return rc;
-}
-
-static int
-rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
-{
-       struct rpcrdma_frmr *f;
-       struct rpcrdma_mw *r;
-       int i, rc;
-
-       i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-       dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
-
-       while (i--) {
-               r = kzalloc(sizeof(*r), GFP_KERNEL);
-               if (r == NULL)
-                       return -ENOMEM;
-               f = &r->r.frmr;
-
-               f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
-                                               ia->ri_max_frmr_depth);
-               if (IS_ERR(f->fr_mr)) {
-                       rc = PTR_ERR(f->fr_mr);
-                       dprintk("RPC:       %s: ib_alloc_fast_reg_mr "
-                               "failed %i\n", __func__, rc);
-                       goto out_free;
-               }
-
-               f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
-                                                       ia->ri_max_frmr_depth);
-               if (IS_ERR(f->fr_pgl)) {
-                       rc = PTR_ERR(f->fr_pgl);
-                       dprintk("RPC:       %s: ib_alloc_fast_reg_page_list "
-                               "failed %i\n", __func__, rc);
-
-                       ib_dereg_mr(f->fr_mr);
-                       goto out_free;
-               }
-
-               list_add(&r->mw_list, &buf->rb_mws);
-               list_add(&r->mw_all, &buf->rb_all);
-       }
-
-       return 0;
-
-out_free:
-       kfree(r);
-       return rc;
-}
-
 int
 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 {
@@ -1244,22 +1098,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
        buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
        p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
 
-       INIT_LIST_HEAD(&buf->rb_mws);
-       INIT_LIST_HEAD(&buf->rb_all);
-       switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_FRMR:
-               rc = rpcrdma_init_frmrs(ia, buf);
-               if (rc)
-                       goto out;
-               break;
-       case RPCRDMA_MTHCAFMR:
-               rc = rpcrdma_init_fmrs(ia, buf);
-               if (rc)
-                       goto out;
-               break;
-       default:
-               break;
-       }
+       rc = ia->ri_ops->ro_init(r_xprt);
+       if (rc)
+               goto out;
 
        for (i = 0; i < buf->rb_max_requests; i++) {
                struct rpcrdma_req *req;
@@ -1311,47 +1152,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
        kfree(req);
 }
 
-static void
-rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
-{
-       struct rpcrdma_mw *r;
-       int rc;
-
-       while (!list_empty(&buf->rb_all)) {
-               r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
-               list_del(&r->mw_all);
-               list_del(&r->mw_list);
-
-               rc = ib_dealloc_fmr(r->r.fmr);
-               if (rc)
-                       dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
-                               __func__, rc);
-
-               kfree(r);
-       }
-}
-
-static void
-rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
-{
-       struct rpcrdma_mw *r;
-       int rc;
-
-       while (!list_empty(&buf->rb_all)) {
-               r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
-               list_del(&r->mw_all);
-               list_del(&r->mw_list);
-
-               rc = ib_dereg_mr(r->r.frmr.fr_mr);
-               if (rc)
-                       dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
-                               __func__, rc);
-               ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-
-               kfree(r);
-       }
-}
-
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
@@ -1372,104 +1172,11 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
                        rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
        }
 
-       switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_FRMR:
-               rpcrdma_destroy_frmrs(buf);
-               break;
-       case RPCRDMA_MTHCAFMR:
-               rpcrdma_destroy_fmrs(buf);
-               break;
-       default:
-               break;
-       }
+       ia->ri_ops->ro_destroy(buf);
 
        kfree(buf->rb_pool);
 }
 
-/* After a disconnect, unmap all FMRs.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_fmr_external().
- */
-static void
-rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
-{
-       struct rpcrdma_xprt *r_xprt =
-                               container_of(ia, struct rpcrdma_xprt, rx_ia);
-       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct list_head *pos;
-       struct rpcrdma_mw *r;
-       LIST_HEAD(l);
-       int rc;
-
-       list_for_each(pos, &buf->rb_all) {
-               r = list_entry(pos, struct rpcrdma_mw, mw_all);
-
-               INIT_LIST_HEAD(&l);
-               list_add(&r->r.fmr->list, &l);
-               rc = ib_unmap_fmr(&l);
-               if (rc)
-                       dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
-                               __func__, rc);
-       }
-}
-
-/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
- * an unusable state. Find FRMRs in this state and dereg / reg
- * each.  FRMRs that are VALID and attached to an rpcrdma_req are
- * also torn down.
- *
- * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_frmr_external().
- */
-static void
-rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
-{
-       struct rpcrdma_xprt *r_xprt =
-                               container_of(ia, struct rpcrdma_xprt, rx_ia);
-       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct list_head *pos;
-       struct rpcrdma_mw *r;
-       int rc;
-
-       list_for_each(pos, &buf->rb_all) {
-               r = list_entry(pos, struct rpcrdma_mw, mw_all);
-
-               if (r->r.frmr.fr_state == FRMR_IS_INVALID)
-                       continue;
-
-               rc = ib_dereg_mr(r->r.frmr.fr_mr);
-               if (rc)
-                       dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
-                               __func__, rc);
-               ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-
-               r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
-                                       ia->ri_max_frmr_depth);
-               if (IS_ERR(r->r.frmr.fr_mr)) {
-                       rc = PTR_ERR(r->r.frmr.fr_mr);
-                       dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
-                               " failed %i\n", __func__, rc);
-                       continue;
-               }
-               r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
-                                       ia->ri_id->device,
-                                       ia->ri_max_frmr_depth);
-               if (IS_ERR(r->r.frmr.fr_pgl)) {
-                       rc = PTR_ERR(r->r.frmr.fr_pgl);
-                       dprintk("RPC:       %s: "
-                               "ib_alloc_fast_reg_page_list "
-                               "failed %i\n", __func__, rc);
-
-                       ib_dereg_mr(r->r.frmr.fr_mr);
-                       continue;
-               }
-               r->r.frmr.fr_state = FRMR_IS_INVALID;
-       }
-}
-
 /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
  * some req segments uninitialized.
  */
@@ -1509,7 +1216,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
        }
 }
 
-/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
+/* rpcrdma_unmap_one() was already done during deregistration.
  * Redo only the ib_post_send().
  */
 static void
@@ -1729,6 +1436,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
  */
 
+void
+rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
+{
+       dprintk("RPC:       map_one: offset %p iova %llx len %zu\n",
+               seg->mr_offset,
+               (unsigned long long)seg->mr_dma, seg->mr_dmalen);
+}
+
 static int
 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
                                struct ib_mr **mrp, struct ib_sge *iov)
@@ -1853,287 +1568,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
        }
 }
 
-/*
- * Wrappers for chunk registration, shared by read/write chunk code.
- */
-
-static void
-rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
-{
-       seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
-       seg->mr_dmalen = seg->mr_len;
-       if (seg->mr_page)
-               seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
-                               seg->mr_page, offset_in_page(seg->mr_offset),
-                               seg->mr_dmalen, seg->mr_dir);
-       else
-               seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
-                               seg->mr_offset,
-                               seg->mr_dmalen, seg->mr_dir);
-       if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
-               dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
-                       __func__,
-                       (unsigned long long)seg->mr_dma,
-                       seg->mr_offset, seg->mr_dmalen);
-       }
-}
-
-static void
-rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
-{
-       if (seg->mr_page)
-               ib_dma_unmap_page(ia->ri_id->device,
-                               seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-       else
-               ib_dma_unmap_single(ia->ri_id->device,
-                               seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-}
-
-static int
-rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
-                       int *nsegs, int writing, struct rpcrdma_ia *ia,
-                       struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_mr_seg *seg1 = seg;
-       struct rpcrdma_mw *mw = seg1->rl_mw;
-       struct rpcrdma_frmr *frmr = &mw->r.frmr;
-       struct ib_mr *mr = frmr->fr_mr;
-       struct ib_send_wr fastreg_wr, *bad_wr;
-       u8 key;
-       int len, pageoff;
-       int i, rc;
-       int seg_len;
-       u64 pa;
-       int page_no;
-
-       pageoff = offset_in_page(seg1->mr_offset);
-       seg1->mr_offset -= pageoff;     /* start of page */
-       seg1->mr_len += pageoff;
-       len = -pageoff;
-       if (*nsegs > ia->ri_max_frmr_depth)
-               *nsegs = ia->ri_max_frmr_depth;
-       for (page_no = i = 0; i < *nsegs;) {
-               rpcrdma_map_one(ia, seg, writing);
-               pa = seg->mr_dma;
-               for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
-                       frmr->fr_pgl->page_list[page_no++] = pa;
-                       pa += PAGE_SIZE;
-               }
-               len += seg->mr_len;
-               ++seg;
-               ++i;
-               /* Check for holes */
-               if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
-                   offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
-                       break;
-       }
-       dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
-               __func__, mw, i);
-
-       frmr->fr_state = FRMR_IS_VALID;
-
-       memset(&fastreg_wr, 0, sizeof(fastreg_wr));
-       fastreg_wr.wr_id = (unsigned long)(void *)mw;
-       fastreg_wr.opcode = IB_WR_FAST_REG_MR;
-       fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
-       fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
-       fastreg_wr.wr.fast_reg.page_list_len = page_no;
-       fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
-       fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
-       if (fastreg_wr.wr.fast_reg.length < len) {
-               rc = -EIO;
-               goto out_err;
-       }
-
-       /* Bump the key */
-       key = (u8)(mr->rkey & 0x000000FF);
-       ib_update_fast_reg_key(mr, ++key);
-
-       fastreg_wr.wr.fast_reg.access_flags = (writing ?
-                               IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
-                               IB_ACCESS_REMOTE_READ);
-       fastreg_wr.wr.fast_reg.rkey = mr->rkey;
-       DECR_CQCOUNT(&r_xprt->rx_ep);
-
-       rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
-       if (rc) {
-               dprintk("RPC:       %s: failed ib_post_send for register,"
-                       " status %i\n", __func__, rc);
-               ib_update_fast_reg_key(mr, --key);
-               goto out_err;
-       } else {
-               seg1->mr_rkey = mr->rkey;
-               seg1->mr_base = seg1->mr_dma + pageoff;
-               seg1->mr_nsegs = i;
-               seg1->mr_len = len;
-       }
-       *nsegs = i;
-       return 0;
-out_err:
-       frmr->fr_state = FRMR_IS_INVALID;
-       while (i--)
-               rpcrdma_unmap_one(ia, --seg);
-       return rc;
-}
-
-static int
-rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
-                       struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_mr_seg *seg1 = seg;
-       struct ib_send_wr invalidate_wr, *bad_wr;
-       int rc;
-
-       seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
-
-       memset(&invalidate_wr, 0, sizeof invalidate_wr);
-       invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
-       invalidate_wr.opcode = IB_WR_LOCAL_INV;
-       invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
-       DECR_CQCOUNT(&r_xprt->rx_ep);
-
-       read_lock(&ia->ri_qplock);
-       while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(ia, seg++);
-       rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
-       read_unlock(&ia->ri_qplock);
-       if (rc) {
-               /* Force rpcrdma_buffer_get() to retry */
-               seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
-               dprintk("RPC:       %s: failed ib_post_send for invalidate,"
-                       " status %i\n", __func__, rc);
-       }
-       return rc;
-}
-
-static int
-rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
-                       int *nsegs, int writing, struct rpcrdma_ia *ia)
-{
-       struct rpcrdma_mr_seg *seg1 = seg;
-       u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
-       int len, pageoff, i, rc;
-
-       pageoff = offset_in_page(seg1->mr_offset);
-       seg1->mr_offset -= pageoff;     /* start of page */
-       seg1->mr_len += pageoff;
-       len = -pageoff;
-       if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
-               *nsegs = RPCRDMA_MAX_DATA_SEGS;
-       for (i = 0; i < *nsegs;) {
-               rpcrdma_map_one(ia, seg, writing);
-               physaddrs[i] = seg->mr_dma;
-               len += seg->mr_len;
-               ++seg;
-               ++i;
-               /* Check for holes */
-               if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
-                   offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
-                       break;
-       }
-       rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
-       if (rc) {
-               dprintk("RPC:       %s: failed ib_map_phys_fmr "
-                       "%u@0x%llx+%i (%d)... status %i\n", __func__,
-                       len, (unsigned long long)seg1->mr_dma,
-                       pageoff, i, rc);
-               while (i--)
-                       rpcrdma_unmap_one(ia, --seg);
-       } else {
-               seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
-               seg1->mr_base = seg1->mr_dma + pageoff;
-               seg1->mr_nsegs = i;
-               seg1->mr_len = len;
-       }
-       *nsegs = i;
-       return rc;
-}
-
-static int
-rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
-                       struct rpcrdma_ia *ia)
-{
-       struct rpcrdma_mr_seg *seg1 = seg;
-       LIST_HEAD(l);
-       int rc;
-
-       list_add(&seg1->rl_mw->r.fmr->list, &l);
-       rc = ib_unmap_fmr(&l);
-       read_lock(&ia->ri_qplock);
-       while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(ia, seg++);
-       read_unlock(&ia->ri_qplock);
-       if (rc)
-               dprintk("RPC:       %s: failed ib_unmap_fmr,"
-                       " status %i\n", __func__, rc);
-       return rc;
-}
-
-int
-rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
-                       int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       int rc = 0;
-
-       switch (ia->ri_memreg_strategy) {
-
-       case RPCRDMA_ALLPHYSICAL:
-               rpcrdma_map_one(ia, seg, writing);
-               seg->mr_rkey = ia->ri_bind_mem->rkey;
-               seg->mr_base = seg->mr_dma;
-               seg->mr_nsegs = 1;
-               nsegs = 1;
-               break;
-
-       /* Registration using frmr registration */
-       case RPCRDMA_FRMR:
-               rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
-               break;
-
-       /* Registration using fmr memory registration */
-       case RPCRDMA_MTHCAFMR:
-               rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
-               break;
-
-       default:
-               return -EIO;
-       }
-       if (rc)
-               return rc;
-
-       return nsegs;
-}
-
-int
-rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
-               struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       int nsegs = seg->mr_nsegs, rc;
-
-       switch (ia->ri_memreg_strategy) {
-
-       case RPCRDMA_ALLPHYSICAL:
-               read_lock(&ia->ri_qplock);
-               rpcrdma_unmap_one(ia, seg);
-               read_unlock(&ia->ri_qplock);
-               break;
-
-       case RPCRDMA_FRMR:
-               rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
-               break;
-
-       case RPCRDMA_MTHCAFMR:
-               rc = rpcrdma_deregister_fmr_external(seg, ia);
-               break;
-
-       default:
-               break;
-       }
-       return nsegs;
-}
-
 /*
  * Prepost any receive buffer, then post send.
  *
@@ -2156,7 +1590,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
        }
 
        send_wr.next = NULL;
-       send_wr.wr_id = 0ULL;   /* no send cookie */
+       send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
        send_wr.sg_list = req->rl_send_iov;
        send_wr.num_sge = req->rl_niovs;
        send_wr.opcode = IB_WR_SEND;
@@ -2215,43 +1649,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
        return rc;
 }
 
-/* Physical mapping means one Read/Write list entry per-page.
- * All list entries must fit within an inline buffer
- *
- * NB: The server must return a Write list for NFS READ,
- *     which has the same constraint. Factor in the inline
- *     rsize as well.
+/* How many chunk list items fit within our inline buffers?
  */
-static size_t
-rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+unsigned int
+rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
-       unsigned int inline_size, pages;
+       int bytes, segments;
 
-       inline_size = min_t(unsigned int,
-                           cdata->inline_wsize, cdata->inline_rsize);
-       inline_size -= RPCRDMA_HDRLEN_MIN;
-       pages = inline_size / sizeof(struct rpcrdma_segment);
-       return pages << PAGE_SHIFT;
-}
-
-static size_t
-rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
-{
-       return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
-}
-
-size_t
-rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
-{
-       size_t result;
-
-       switch (r_xprt->rx_ia.ri_memreg_strategy) {
-       case RPCRDMA_ALLPHYSICAL:
-               result = rpcrdma_physical_max_payload(r_xprt);
-               break;
-       default:
-               result = rpcrdma_mr_max_payload(r_xprt);
+       bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
+       bytes -= RPCRDMA_HDRLEN_MIN;
+       if (bytes < sizeof(struct rpcrdma_segment) * 2) {
+               pr_warn("RPC:       %s: inline threshold too small\n",
+                       __func__);
+               return 0;
        }
-       return result;
+
+       segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
+       dprintk("RPC:       %s: max chunk list size = %d segments\n",
+               __func__, segments);
+       return segments;
 }
index 0a16fb6f088590c142232ad99b2ac3b29304c169..78e0b8beaa363f4ea8a2e2c1073bf5d5c0a92b45 100644 (file)
@@ -60,6 +60,7 @@
  * Interface Adapter -- one per transport instance
  */
 struct rpcrdma_ia {
+       const struct rpcrdma_memreg_ops *ri_ops;
        rwlock_t                ri_qplock;
        struct rdma_cm_id       *ri_id;
        struct ib_pd            *ri_pd;
@@ -105,6 +106,10 @@ struct rpcrdma_ep {
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
 
+/* Force completion handler to ignore the signal
+ */
+#define RPCRDMA_IGNORE_COMPLETION      (0ULL)
+
 /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
  *
  * The below structure appears at the front of a large region of kmalloc'd
@@ -143,14 +148,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
        return (struct rpcrdma_msg *)rb->rg_base;
 }
 
-enum rpcrdma_chunktype {
-       rpcrdma_noch = 0,
-       rpcrdma_readch,
-       rpcrdma_areadch,
-       rpcrdma_writech,
-       rpcrdma_replych
-};
-
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
  * and complete a reply, asychronously. It needs several pieces of
@@ -213,6 +210,7 @@ struct rpcrdma_mw {
                struct ib_fmr           *fmr;
                struct rpcrdma_frmr     frmr;
        } r;
+       void                    (*mw_sendcompletion)(struct ib_wc *);
        struct list_head        mw_list;
        struct list_head        mw_all;
 };
@@ -258,7 +256,6 @@ struct rpcrdma_req {
        unsigned int    rl_niovs;       /* 0, 2 or 4 */
        unsigned int    rl_nchunks;     /* non-zero if chunks */
        unsigned int    rl_connect_cookie;      /* retry detection */
-       enum rpcrdma_chunktype  rl_rtype, rl_wtype;
        struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
        struct rpcrdma_rep      *rl_reply;/* holder for reply buffer */
        struct ib_sge   rl_send_iov[4]; /* for active requests */
@@ -339,6 +336,29 @@ struct rpcrdma_stats {
        unsigned long           bad_reply_count;
 };
 
+/*
+ * Per-registration mode operations
+ */
+struct rpcrdma_xprt;
+struct rpcrdma_memreg_ops {
+       int             (*ro_map)(struct rpcrdma_xprt *,
+                                 struct rpcrdma_mr_seg *, int, bool);
+       int             (*ro_unmap)(struct rpcrdma_xprt *,
+                                   struct rpcrdma_mr_seg *);
+       int             (*ro_open)(struct rpcrdma_ia *,
+                                  struct rpcrdma_ep *,
+                                  struct rpcrdma_create_data_internal *);
+       size_t          (*ro_maxpages)(struct rpcrdma_xprt *);
+       int             (*ro_init)(struct rpcrdma_xprt *);
+       void            (*ro_reset)(struct rpcrdma_xprt *);
+       void            (*ro_destroy)(struct rpcrdma_buffer *);
+       const char      *ro_displayname;
+};
+
+extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
+
 /*
  * RPCRDMA transport -- encapsulates the structures above for
  * integration with RPC.
@@ -398,16 +418,56 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
 
-int rpcrdma_register_external(struct rpcrdma_mr_seg *,
-                               int, int, struct rpcrdma_xprt *);
-int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
-                               struct rpcrdma_xprt *);
-
 struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
                                            size_t, gfp_t);
 void rpcrdma_free_regbuf(struct rpcrdma_ia *,
                         struct rpcrdma_regbuf *);
 
+unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
+
+/*
+ * Wrappers for chunk registration, shared by read/write chunk code.
+ */
+
+void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
+
+static inline enum dma_data_direction
+rpcrdma_data_dir(bool writing)
+{
+       return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+}
+
+static inline void
+rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
+               enum dma_data_direction direction)
+{
+       seg->mr_dir = direction;
+       seg->mr_dmalen = seg->mr_len;
+
+       if (seg->mr_page)
+               seg->mr_dma = ib_dma_map_page(device,
+                               seg->mr_page, offset_in_page(seg->mr_offset),
+                               seg->mr_dmalen, seg->mr_dir);
+       else
+               seg->mr_dma = ib_dma_map_single(device,
+                               seg->mr_offset,
+                               seg->mr_dmalen, seg->mr_dir);
+
+       if (ib_dma_mapping_error(device, seg->mr_dma))
+               rpcrdma_mapping_error(seg);
+}
+
+static inline void
+rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
+{
+       if (seg->mr_page)
+               ib_dma_unmap_page(device,
+                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+       else
+               ib_dma_unmap_single(device,
+                                   seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+}
+
 /*
  * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
  */
@@ -418,9 +478,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
-ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
 int rpcrdma_marshal_req(struct rpc_rqst *);
-size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
 
 /* Temporary NFS request map cache. Created in svc_rdma.c  */
 extern struct kmem_cache *svc_rdma_map_cachep;