Merge tag 'nfs-for-4.2-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 2 Jul 2015 18:32:23 +0000 (11:32 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 2 Jul 2015 18:32:23 +0000 (11:32 -0700)
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable patches:
   - Fix a crash in the NFSv4 file locking code
   - Fix an fsync() regression, where we were failing to retry I/O in
     some circumstances
   - Fix an infinite loop in NFSv4.0 OPEN stateid recovery
   - Fix a memory leak when an attempted pNFS I/O fails
   - Fix a memory leak in the backchannel code
   - Large hostnames were not supported correctly in NFSv4.1
   - Fix a pNFS/flexfiles bug that was impeding error reporting on I/O
   - Fix a couple of credential issues in pNFS/flexfiles

  Bugfixes + cleanups:
   - Open flag sanity checks in the NFSv4 atomic open codepath
   - More NFSv4 delegation related bugfixes
   - Various NFSv4.1 backchannel bugfixes and cleanups
   - Fix the NFS swap socket code
   - Various cleanups of the NFSv4 SETCLIENTID and EXCHANGE_ID code
   - Fix a UDP transport deadlock issue

  Features:
   - More RDMA client transport improvements
   - NFSv4.2 LAYOUTSTATS functionality for pNFS/flexfiles"

* tag 'nfs-for-4.2-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (87 commits)
  nfs: Remove invalid tk_pid from debug message
  nfs: Remove invalid NFS_ATTR_FATTR_V4_REFERRAL checking in nfs4_get_rootfh
  nfs: Drop bad comment in nfs41_walk_client_list()
  nfs: Remove unneeded micro checking of CONFIG_PROC_FS
  nfs: Don't setting FILE_CREATED flags always
  nfs: Use remove_proc_subtree() instead remove_proc_entry()
  nfs: Remove unused argument in nfs_server_set_fsinfo()
  nfs: Fix a memory leak when meeting an unsupported state protect
  nfs: take extra reference to fl->fl_file when running a LOCKU operation
  NFSv4: When returning a delegation, don't reclaim an incompatible open mode.
  NFSv4.2: LAYOUTSTATS is optional to implement
  NFSv4.2: Fix up a decoding error in layoutstats
  pNFS/flexfiles: Fix the reset of struct pgio_header when resending
  pNFS/flexfiles: Turn off layoutcommit for servers that don't need it
  pnfs/flexfiles: protect ktime manipulation with mirror lock
  nfs: provide pnfs_report_layoutstat when NFS42 is disabled
  nfs: verify open flags before allowing open
  nfs: always update creds in mirror, even when we have an already connected ds
  nfs: fix potential credential leak in ff_layout_update_mirror_cred
  pnfs/flexfiles: report layoutstat regularly
  ...

51 files changed:
fs/nfs/callback.c
fs/nfs/callback_proc.c
fs/nfs/callback_xdr.c
fs/nfs/client.c
fs/nfs/dir.c
fs/nfs/file.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayout.h
fs/nfs/flexfilelayout/flexfilelayoutdev.c
fs/nfs/inode.c
fs/nfs/nfs3xdr.c
fs/nfs/nfs42.h
fs/nfs/nfs42proc.c
fs/nfs/nfs42xdr.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4client.c
fs/nfs/nfs4file.c
fs/nfs/nfs4getroot.c
fs/nfs/nfs4idmap.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4xdr.c
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/write.c
include/linux/nfs4.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_page.h
include/linux/nfs_xdr.h
include/linux/sunrpc/bc_xprt.h
include/linux/sunrpc/clnt.h
include/linux/sunrpc/sched.h
include/linux/sunrpc/xprt.h
include/linux/sunrpc/xprtrdma.h
net/sunrpc/Makefile
net/sunrpc/backchannel_rqst.c
net/sunrpc/bc_svc.c [deleted file]
net/sunrpc/clnt.c
net/sunrpc/debugfs.c
net/sunrpc/svc.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/fmr_ops.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/physical_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c

index 8d129bb7355afbb2ca7f1904ff0263f604e86dd6..682529c009966b85f986955c04d2b48fb645e981 100644 (file)
@@ -458,7 +458,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
  * pg_authenticate method for nfsv4 callback threads.
  *
  * The authflavor has been negotiated, so an incorrect flavor is a server
- * bug. Drop packets with incorrect authflavor.
+ * bug. Deny packets with incorrect authflavor.
  *
  * All other checking done after NFS decoding where the nfs_client can be
  * found in nfs4_callback_compound
@@ -468,12 +468,12 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
        switch (rqstp->rq_authop->flavour) {
        case RPC_AUTH_NULL:
                if (rqstp->rq_proc != CB_NULL)
-                       return SVC_DROP;
+                       return SVC_DENIED;
                break;
        case RPC_AUTH_GSS:
                /* No RPC_AUTH_GSS support yet in NFSv4.1 */
                 if (svc_is_backchannel(rqstp))
-                       return SVC_DROP;
+                       return SVC_DENIED;
        }
        return SVC_OK;
 }
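
The switch from SVC_DROP to SVC_DENIED matters because a dropped request
produces no reply at all, so the misbehaving peer simply retransmits,
whereas a denied request gets an RPC-level MSG_DENIED/AUTH_ERROR reply it
can act on. A standalone sketch (hypothetical userspace types, not the
sunrpc API) of that distinction:

    #include <stdio.h>

    enum svc_result { SVC_OK, SVC_DROP, SVC_DENIED };

    /* What the server transport does with each authentication verdict. */
    static const char *dispatch(enum svc_result r)
    {
            switch (r) {
            case SVC_OK:     return "process the call, send a normal reply";
            case SVC_DROP:   return "discard silently; the client retransmits";
            case SVC_DENIED: return "send an RPC MSG_DENIED/AUTH_ERROR reply";
            }
            return "?";
    }

    int main(void)
    {
            /* A negotiated-but-wrong authflavor is a server bug: deny it. */
            printf("%s\n", dispatch(SVC_DENIED));
            return 0;
    }
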
index 197806fb87ffb459c19f3c4bbc8da50c58c870dc..29e3c1b011b73e4661f4deb1ef200e2f8d27792b 100644 (file)
@@ -327,10 +327,8 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
        dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
 
        /* Normal */
-       if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
-               slot->seq_nr++;
+       if (likely(args->csa_sequenceid == slot->seq_nr + 1))
                goto out_ok;
-       }
 
        /* Replay */
        if (args->csa_sequenceid == slot->seq_nr) {
@@ -418,6 +416,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
                              struct cb_process_state *cps)
 {
        struct nfs4_slot_table *tbl;
+       struct nfs4_slot *slot;
        struct nfs_client *clp;
        int i;
        __be32 status = htonl(NFS4ERR_BADSESSION);
@@ -429,25 +428,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 
        if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
                goto out;
+
        tbl = &clp->cl_session->bc_slot_table;
+       slot = tbl->slots + args->csa_slotid;
 
        spin_lock(&tbl->slot_tbl_lock);
        /* state manager is resetting the session */
        if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
-               spin_unlock(&tbl->slot_tbl_lock);
                status = htonl(NFS4ERR_DELAY);
                /* Return NFS4ERR_BADSESSION if we're draining the session
                 * in order to reset it.
                 */
                if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
                        status = htonl(NFS4ERR_BADSESSION);
-               goto out;
+               goto out_unlock;
        }
 
-       status = validate_seqid(&clp->cl_session->bc_slot_table, args);
-       spin_unlock(&tbl->slot_tbl_lock);
+       memcpy(&res->csr_sessionid, &args->csa_sessionid,
+              sizeof(res->csr_sessionid));
+       res->csr_sequenceid = args->csa_sequenceid;
+       res->csr_slotid = args->csa_slotid;
+       res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+       res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+
+       status = validate_seqid(tbl, args);
        if (status)
-               goto out;
+               goto out_unlock;
 
        cps->slotid = args->csa_slotid;
 
@@ -458,15 +464,17 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
         */
        if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
                status = htonl(NFS4ERR_DELAY);
-               goto out;
+               goto out_unlock;
        }
 
-       memcpy(&res->csr_sessionid, &args->csa_sessionid,
-              sizeof(res->csr_sessionid));
-       res->csr_sequenceid = args->csa_sequenceid;
-       res->csr_slotid = args->csa_slotid;
-       res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
-       res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+       /*
+        * RFC5661 20.9.3
+        * If CB_SEQUENCE returns an error, then the state of the slot
+        * (sequence ID, cached reply) MUST NOT change.
+        */
+       slot->seq_nr++;
+out_unlock:
+       spin_unlock(&tbl->slot_tbl_lock);
 
 out:
        cps->clp = clp; /* put in nfs4_callback_compound */
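
The point of the reshuffle above is RFC 5661, section 20.9.3: if
CB_SEQUENCE returns an error, the slot's state (sequence ID, cached
reply) must not change, so the seq_nr increment is deferred until every
check has passed. A minimal userspace sketch (not the kernel code) of
the slot rule:

    #include <stdint.h>
    #include <stdio.h>

    enum seq_result { SEQ_OK, SEQ_REPLAY, SEQ_MISORDERED };

    struct bc_slot { uint32_t seq_nr; };

    /* Classify an incoming sequence ID against the cached slot state. */
    static enum seq_result classify_seqid(const struct bc_slot *slot,
                                          uint32_t csa_seqid)
    {
            if (csa_seqid == slot->seq_nr + 1)
                    return SEQ_OK;          /* normal: next in sequence */
            if (csa_seqid == slot->seq_nr)
                    return SEQ_REPLAY;      /* retransmission of last call */
            return SEQ_MISORDERED;
    }

    int main(void)
    {
            struct bc_slot slot = { .seq_nr = 5 };

            /* Commit the new sequence ID only after all checks passed. */
            if (classify_seqid(&slot, 6) == SEQ_OK)
                    slot.seq_nr++;
            printf("slot now at %u\n", slot.seq_nr);
            return 0;
    }
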
index 19ca95cdfd9b0f26aedbbc23f036babf2aeca67a..6b1697a01dde35e1384d72a4a7cd2e02d1afbf84 100644 (file)
@@ -909,7 +909,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
 
        status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
-       if (status == __constant_htonl(NFS4ERR_RESOURCE))
+       if (status == htonl(NFS4ERR_RESOURCE))
                return rpc_garbage_args;
 
        if (hdr_arg.minorversion == 0) {
index 892aefff36300a0861f9bf5d43cb7a4b5069d873..ecebb406cc1aec554ce780f2c3680eddc351e8da 100644 (file)
@@ -825,7 +825,6 @@ error:
  * Load up the server record from information gained in an fsinfo record
  */
 static void nfs_server_set_fsinfo(struct nfs_server *server,
-                                 struct nfs_fh *mntfh,
                                  struct nfs_fsinfo *fsinfo)
 {
        unsigned long max_rpc_payload;
@@ -901,7 +900,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
        if (error < 0)
                goto out_error;
 
-       nfs_server_set_fsinfo(server, mntfh, &fsinfo);
+       nfs_server_set_fsinfo(server, &fsinfo);
 
        /* Get some general file system info */
        if (server->namelen == 0) {
@@ -1193,8 +1192,6 @@ void nfs_clients_init(struct net *net)
 }
 
 #ifdef CONFIG_PROC_FS
-static struct proc_dir_entry *proc_fs_nfs;
-
 static int nfs_server_list_open(struct inode *inode, struct file *file);
 static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
 static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
@@ -1364,27 +1361,29 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 {
        struct nfs_server *server;
        struct nfs_client *clp;
-       char dev[8], fsid[17];
+       char dev[13];   // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0'
+       char fsid[34];  // 2 * 16 for %llx, 1 for ':', 1 for '\0'
        struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
        /* display header on line 1 */
        if (v == &nn->nfs_volume_list) {
-               seq_puts(m, "NV SERVER   PORT DEV     FSID              FSC\n");
+               seq_puts(m, "NV SERVER   PORT DEV          FSID"
+                           "                              FSC\n");
                return 0;
        }
        /* display one transport per line on subsequent lines */
        server = list_entry(v, struct nfs_server, master_link);
        clp = server->nfs_client;
 
-       snprintf(dev, 8, "%u:%u",
+       snprintf(dev, sizeof(dev), "%u:%u",
                 MAJOR(server->s_dev), MINOR(server->s_dev));
 
-       snprintf(fsid, 17, "%llx:%llx",
+       snprintf(fsid, sizeof(fsid), "%llx:%llx",
                 (unsigned long long) server->fsid.major,
                 (unsigned long long) server->fsid.minor);
 
        rcu_read_lock();
-       seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
+       seq_printf(m, "v%u %s %s %-12s %-33s %s\n",
                   clp->rpc_ops->version,
                   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
                   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
@@ -1434,27 +1433,20 @@ void nfs_fs_proc_net_exit(struct net *net)
  */
 int __init nfs_fs_proc_init(void)
 {
-       struct proc_dir_entry *p;
-
-       proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL);
-       if (!proc_fs_nfs)
+       if (!proc_mkdir("fs/nfsfs", NULL))
                goto error_0;
 
        /* a file of servers with which we're dealing */
-       p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
-       if (!p)
+       if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers"))
                goto error_1;
 
        /* a file of volumes that we have mounted */
-       p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
-       if (!p)
-               goto error_2;
-       return 0;
+       if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes"))
+               goto error_1;
 
-error_2:
-       remove_proc_entry("servers", proc_fs_nfs);
+       return 0;
 error_1:
-       remove_proc_entry("fs/nfsfs", NULL);
+       remove_proc_subtree("fs/nfsfs", NULL);
 error_0:
        return -ENOMEM;
 }
@@ -1464,9 +1456,7 @@ error_0:
  */
 void nfs_fs_proc_exit(void)
 {
-       remove_proc_entry("volumes", proc_fs_nfs);
-       remove_proc_entry("servers", proc_fs_nfs);
-       remove_proc_entry("fs/nfsfs", NULL);
+       remove_proc_subtree("fs/nfsfs", NULL);
 }
 
 #endif /* CONFIG_PROC_FS */
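
The conversion above leans on two procfs conveniences: proc_mkdir() and
proc_symlink() accept a full path with a NULL parent, and
remove_proc_subtree() tears down a directory and everything inside it in
one call. A compile-only module sketch (hypothetical "fs/demo" names) of
the same pattern:

    #include <linux/module.h>
    #include <linux/proc_fs.h>

    static int __init demo_proc_init(void)
    {
            if (!proc_mkdir("fs/demo", NULL))
                    return -ENOMEM;

            /* Path-based creation: no parent pointer to keep around. */
            if (!proc_symlink("fs/demo/servers", NULL,
                              "../../net/demo/servers"))
                    goto err;
            return 0;
    err:
            /* One call removes the directory and anything left in it. */
            remove_proc_subtree("fs/demo", NULL);
            return -ENOMEM;
    }

    static void __exit demo_proc_exit(void)
    {
            remove_proc_subtree("fs/demo", NULL);
    }

    module_init(demo_proc_init);
    module_exit(demo_proc_exit);
    MODULE_LICENSE("GPL");
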
index b2c8b31b2be77d9a1d524b230ed2b66e479ad3fe..21457bb0edd62b42af307d5850b336711f41e82f 100644 (file)
@@ -1470,9 +1470,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
 {
        int err;
 
-       if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
-               *opened |= FILE_CREATED;
-
        err = finish_open(file, dentry, do_open, opened);
        if (err)
                goto out;
index 8b8d83a526ce2366ae974a87761c9c62c22da7f5..cc4fa1ed61fc5bdfe04d1afcaa5f081bb3ba0470 100644 (file)
@@ -555,31 +555,22 @@ static int nfs_launder_page(struct page *page)
        return nfs_wb_page(inode, page);
 }
 
-#ifdef CONFIG_NFS_SWAP
 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
                                                sector_t *span)
 {
-       int ret;
        struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
        *span = sis->pages;
 
-       rcu_read_lock();
-       ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
-       rcu_read_unlock();
-
-       return ret;
+       return rpc_clnt_swap_activate(clnt);
 }
 
 static void nfs_swap_deactivate(struct file *file)
 {
        struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
-       rcu_read_lock();
-       xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
-       rcu_read_unlock();
+       rpc_clnt_swap_deactivate(clnt);
 }
-#endif
 
 const struct address_space_operations nfs_file_aops = {
        .readpage = nfs_readpage,
@@ -596,10 +587,8 @@ const struct address_space_operations nfs_file_aops = {
        .launder_page = nfs_launder_page,
        .is_dirty_writeback = nfs_check_dirty_writeback,
        .error_remove_page = generic_error_remove_page,
-#ifdef CONFIG_NFS_SWAP
        .swap_activate = nfs_swap_activate,
        .swap_deactivate = nfs_swap_deactivate,
-#endif
 };
 
 /*
index 7d05089e52d6c8b7a29e92e80011bdee0e06a32c..c12951b9551eab8b0394ed2aa218cf15ce6f2c39 100644 (file)
@@ -20,6 +20,7 @@
 #include "../nfs4trace.h"
 #include "../iostat.h"
 #include "../nfs.h"
+#include "../nfs42.h"
 
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 
@@ -182,17 +183,14 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
 
 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 {
-       struct nfs4_ff_layout_mirror *tmp;
        int i, j;
 
        for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
                for (j = i + 1; j < fls->mirror_array_cnt; j++)
                        if (fls->mirror_array[i]->efficiency <
-                           fls->mirror_array[j]->efficiency) {
-                               tmp = fls->mirror_array[i];
-                               fls->mirror_array[i] = fls->mirror_array[j];
-                               fls->mirror_array[j] = tmp;
-                       }
+                           fls->mirror_array[j]->efficiency)
+                               swap(fls->mirror_array[i],
+                                    fls->mirror_array[j]);
        }
 }
 
@@ -274,6 +272,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 
                spin_lock_init(&fls->mirror_array[i]->lock);
                fls->mirror_array[i]->ds_count = ds_count;
+               fls->mirror_array[i]->lseg = &fls->generic_hdr;
 
                /* deviceid */
                rc = decode_deviceid(&stream, &devid);
@@ -344,6 +343,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                        fls->mirror_array[i]->gid);
        }
 
+       p = xdr_inline_decode(&stream, 4);
+       if (p)
+               fls->flags = be32_to_cpup(p);
+
        ff_layout_sort_mirrors(fls);
        rc = ff_layout_check_layout(lgr);
        if (rc)
@@ -415,6 +418,146 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
        return 1;
 }
 
+static void
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
+{
+       /* first IO request? */
+       if (atomic_inc_return(&timer->n_ops) == 1) {
+               timer->start_time = ktime_get();
+       }
+}
+
+static ktime_t
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
+{
+       ktime_t start, now;
+
+       if (atomic_dec_return(&timer->n_ops) < 0)
+               WARN_ON_ONCE(1);
+
+       now = ktime_get();
+       start = timer->start_time;
+       timer->start_time = now;
+       return ktime_sub(now, start);
+}
+
+static ktime_t
+nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
+{
+       return ktime_sub(ktime_get(), task->tk_start);
+}
+
+static bool
+nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
+                           struct nfs4_ff_layoutstat *layoutstat)
+{
+       static const ktime_t notime = {0};
+       ktime_t now = ktime_get();
+
+       nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
+       if (ktime_equal(mirror->start_time, notime))
+               mirror->start_time = now;
+       if (ktime_equal(mirror->last_report_time, notime))
+               mirror->last_report_time = now;
+       if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
+                       FF_LAYOUTSTATS_REPORT_INTERVAL) {
+               mirror->last_report_time = now;
+               return true;
+       }
+
+       return false;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
+               __u64 requested)
+{
+       struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+
+       iostat->ops_requested++;
+       iostat->bytes_requested += requested;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
+               __u64 requested,
+               __u64 completed,
+               ktime_t time_completed)
+{
+       struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+       ktime_t timer;
+
+       iostat->ops_completed++;
+       iostat->bytes_completed += completed;
+       iostat->bytes_not_delivered += requested - completed;
+
+       timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
+       iostat->total_busy_time =
+                       ktime_add(iostat->total_busy_time, timer);
+       iostat->aggregate_completion_time =
+                       ktime_add(iostat->aggregate_completion_time, time_completed);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested)
+{
+       bool report;
+
+       spin_lock(&mirror->lock);
+       report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
+       nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+       spin_unlock(&mirror->lock);
+
+       if (report)
+               pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
+               struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested,
+               __u64 completed)
+{
+       spin_lock(&mirror->lock);
+       nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
+                       requested, completed,
+                       nfs4_ff_layout_calc_completion_time(task));
+       spin_unlock(&mirror->lock);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested)
+{
+       bool report;
+
+       spin_lock(&mirror->lock);
+       report = nfs4_ff_layoutstat_start_io(mirror, &mirror->write_stat);
+       nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+       spin_unlock(&mirror->lock);
+
+       if (report)
+               pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
+               struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested,
+               __u64 completed,
+               enum nfs3_stable_how committed)
+{
+       if (committed == NFS_UNSTABLE)
+               requested = completed = 0;
+
+       spin_lock(&mirror->lock);
+       nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
+                       requested, completed,
+                       nfs4_ff_layout_calc_completion_time(task));
+       spin_unlock(&mirror->lock);
+}
+
 static int
 ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
                            struct nfs_commit_info *cinfo,
@@ -631,7 +774,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
                        nfs_direct_set_resched_writes(hdr->dreq);
                        /* fake unstable write to let common nfs resend pages */
                        hdr->verf.committed = NFS_UNSTABLE;
-                       hdr->good_bytes = 0;
+                       hdr->good_bytes = hdr->args.count;
                }
                return;
        }
@@ -879,6 +1022,12 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
        return 0;
 }
 
+static bool
+ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
+{
+       return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
+}
+
 /*
  * We reference the rpc_cred of the first WRITE that triggers the need for
  * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
@@ -891,6 +1040,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 static void
 ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
+       if (!ff_layout_need_layoutcommit(hdr->lseg))
+               return;
+
        pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
                        hdr->mds_offset + hdr->res.count);
        dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
@@ -909,6 +1061,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
 static int ff_layout_read_prepare_common(struct rpc_task *task,
                                         struct nfs_pgio_header *hdr)
 {
+       nfs4_ff_layout_stat_io_start_read(
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count);
+
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
@@ -962,15 +1118,15 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
-       if (ff_layout_read_prepare_common(task, hdr))
-               return;
-
        if (ff_layout_setup_sequence(hdr->ds_clp,
                                     &hdr->args.seq_args,
                                     &hdr->res.seq_res,
                                     task))
                return;
 
+       if (ff_layout_read_prepare_common(task, hdr))
+               return;
+
        if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
                        hdr->args.lock_context, FMODE_READ) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -982,6 +1138,10 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
 
        dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
+       nfs4_ff_layout_stat_io_end_read(task,
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count, hdr->res.count);
+
        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1074,7 +1234,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
                return -EAGAIN;
        }
 
-       if (data->verf.committed == NFS_UNSTABLE)
+       if (data->verf.committed == NFS_UNSTABLE
+           && ff_layout_need_layoutcommit(data->lseg))
                pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 
        return 0;
@@ -1083,6 +1244,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 static int ff_layout_write_prepare_common(struct rpc_task *task,
                                          struct nfs_pgio_header *hdr)
 {
+       nfs4_ff_layout_stat_io_start_write(
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count);
+
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
@@ -1116,15 +1281,15 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
-       if (ff_layout_write_prepare_common(task, hdr))
-               return;
-
        if (ff_layout_setup_sequence(hdr->ds_clp,
                                     &hdr->args.seq_args,
                                     &hdr->res.seq_res,
                                     task))
                return;
 
+       if (ff_layout_write_prepare_common(task, hdr))
+               return;
+
        if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
                        hdr->args.lock_context, FMODE_WRITE) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -1134,6 +1299,11 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
+       nfs4_ff_layout_stat_io_end_write(task,
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count, hdr->res.count,
+                       hdr->res.verf->committed);
+
        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1152,8 +1322,17 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
            &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
 }
 
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+               struct nfs_commit_data *cdata)
+{
+       nfs4_ff_layout_stat_io_start_write(
+                       FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+                       0);
+}
+
 static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
 {
+       ff_layout_commit_prepare_common(task, data);
        rpc_call_start(task);
 }
 
@@ -1161,10 +1340,30 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
 {
        struct nfs_commit_data *wdata = data;
 
-       ff_layout_setup_sequence(wdata->ds_clp,
+       if (ff_layout_setup_sequence(wdata->ds_clp,
                                 &wdata->args.seq_args,
                                 &wdata->res.seq_res,
-                                task);
+                                task))
+               return;
+       ff_layout_commit_prepare_common(task, data);
+}
+
+static void ff_layout_commit_done(struct rpc_task *task, void *data)
+{
+       struct nfs_commit_data *cdata = data;
+       struct nfs_page *req;
+       __u64 count = 0;
+
+       if (task->tk_status == 0) {
+               list_for_each_entry(req, &cdata->pages, wb_list)
+                       count += req->wb_bytes;
+       }
+
+       nfs4_ff_layout_stat_io_end_write(task,
+                       FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+                       count, count, NFS_FILE_SYNC);
+
+       pnfs_generic_write_commit_done(task, data);
 }
 
 static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
@@ -1205,14 +1404,14 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v3,
-       .rpc_call_done = pnfs_generic_write_commit_done,
+       .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
        .rpc_release = pnfs_generic_commit_release,
 };
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v4,
-       .rpc_call_done = pnfs_generic_write_commit_done,
+       .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
        .rpc_release = pnfs_generic_commit_release,
 };
@@ -1256,7 +1455,6 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
        fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
        if (fh)
                hdr->args.fh = fh;
-
        /*
         * Note that if we ever decide to split across DSes,
         * then we may need to handle dense-like offsets.
@@ -1385,6 +1583,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
        fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
        if (fh)
                data->args.fh = fh;
+
        return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
                                   vers == 3 ? &ff_layout_commit_call_ops_v3 :
                                               &ff_layout_commit_call_ops_v4,
@@ -1488,6 +1687,247 @@ out:
        dprintk("%s: Return\n", __func__);
 }
 
+static int
+ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
+{
+       const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+
+       return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
+}
+
+static size_t
+ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
+                         const int buflen)
+{
+       const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+       const struct in6_addr *addr = &sin6->sin6_addr;
+
+       /*
+        * RFC 4291, Section 2.2.2
+        *
+        * Shorthanded ANY address
+        */
+       if (ipv6_addr_any(addr))
+               return snprintf(buf, buflen, "::");
+
+       /*
+        * RFC 4291, Section 2.2.2
+        *
+        * Shorthanded loopback address
+        */
+       if (ipv6_addr_loopback(addr))
+               return snprintf(buf, buflen, "::1");
+
+       /*
+        * RFC 4291, Section 2.2.3
+        *
+        * Special presentation address format for mapped v4
+        * addresses.
+        */
+       if (ipv6_addr_v4mapped(addr))
+               return snprintf(buf, buflen, "::ffff:%pI4",
+                                       &addr->s6_addr32[3]);
+
+       /*
+        * RFC 4291, Section 2.2.1
+        */
+       return snprintf(buf, buflen, "%pI6c", addr);
+}
+
+/* Derived from rpc_sockaddr2uaddr */
+static void
+ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
+{
+       struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
+       char portbuf[RPCBIND_MAXUADDRPLEN];
+       char addrbuf[RPCBIND_MAXUADDRLEN];
+       char *netid;
+       unsigned short port;
+       int len, netid_len;
+       __be32 *p;
+
+       switch (sap->sa_family) {
+       case AF_INET:
+               if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
+                       return;
+               port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+               netid = "tcp";
+               netid_len = 3;
+               break;
+       case AF_INET6:
+               if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
+                       return;
+               port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+               netid = "tcp6";
+               netid_len = 4;
+               break;
+       default:
+               /* we only support tcp and tcp6 */
+               WARN_ON_ONCE(1);
+               return;
+       }
+
+       snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
+       len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
+
+       p = xdr_reserve_space(xdr, 4 + netid_len);
+       xdr_encode_opaque(p, netid, netid_len);
+
+       p = xdr_reserve_space(xdr, 4 + len);
+       xdr_encode_opaque(p, addrbuf, len);
+}
+
+static void
+ff_layout_encode_nfstime(struct xdr_stream *xdr,
+                        ktime_t t)
+{
+       struct timespec64 ts;
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, 12);
+       ts = ktime_to_timespec64(t);
+       p = xdr_encode_hyper(p, ts.tv_sec);
+       *p++ = cpu_to_be32(ts.tv_nsec);
+}
+
+static void
+ff_layout_encode_io_latency(struct xdr_stream *xdr,
+                           struct nfs4_ff_io_stat *stat)
+{
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, 5 * 8);
+       p = xdr_encode_hyper(p, stat->ops_requested);
+       p = xdr_encode_hyper(p, stat->bytes_requested);
+       p = xdr_encode_hyper(p, stat->ops_completed);
+       p = xdr_encode_hyper(p, stat->bytes_completed);
+       p = xdr_encode_hyper(p, stat->bytes_not_delivered);
+       ff_layout_encode_nfstime(xdr, stat->total_busy_time);
+       ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
+}
+
+static void
+ff_layout_encode_layoutstats(struct xdr_stream *xdr,
+                            struct nfs42_layoutstat_args *args,
+                            struct nfs42_layoutstat_devinfo *devinfo)
+{
+       struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private;
+       struct nfs4_pnfs_ds_addr *da;
+       struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
+       struct nfs_fh *fh = &mirror->fh_versions[0];
+       __be32 *p, *start;
+
+       da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
+       dprintk("%s: DS %s: encoding address %s\n",
+               __func__, ds->ds_remotestr, da->da_remotestr);
+       /* layoutupdate length */
+       start = xdr_reserve_space(xdr, 4);
+       /* netaddr4 */
+       ff_layout_encode_netaddr(xdr, da);
+       /* nfs_fh4 */
+       p = xdr_reserve_space(xdr, 4 + fh->size);
+       xdr_encode_opaque(p, fh->data, fh->size);
+       /* ff_io_latency4 read */
+       spin_lock(&mirror->lock);
+       ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
+       /* ff_io_latency4 write */
+       ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
+       spin_unlock(&mirror->lock);
+       /* nfstime4 */
+       ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
+       /* bool */
+       p = xdr_reserve_space(xdr, 4);
+       *p = cpu_to_be32(false);
+
+       *start = cpu_to_be32((xdr->p - start - 1) * 4);
+}
+
+static bool
+ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
+                              struct pnfs_layout_segment *pls,
+                              int *dev_count, int dev_limit)
+{
+       struct nfs4_ff_layout_mirror *mirror;
+       struct nfs4_deviceid_node *dev;
+       struct nfs42_layoutstat_devinfo *devinfo;
+       int i;
+
+       for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
+               if (*dev_count >= dev_limit)
+                       break;
+               mirror = FF_LAYOUT_COMP(pls, i);
+               if (!mirror || !mirror->mirror_ds)
+                       continue;
+               dev = FF_LAYOUT_DEVID_NODE(pls, i);
+               devinfo = &args->devinfo[*dev_count];
+               memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
+               devinfo->offset = pls->pls_range.offset;
+               devinfo->length = pls->pls_range.length;
+               /* well, we don't really know if IO is continuous or not! */
+               devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
+               devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
+               devinfo->write_count = mirror->write_stat.io_stat.bytes_completed;
+               devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
+               devinfo->layout_type = LAYOUT_FLEX_FILES;
+               devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
+               devinfo->layout_private = mirror;
+               /* lseg refcount put in cleanup_layoutstats */
+               pnfs_get_lseg(pls);
+
+               ++(*dev_count);
+       }
+
+       return *dev_count < dev_limit;
+}
+
+static int
+ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
+{
+       struct pnfs_layout_segment *pls;
+       int dev_count = 0;
+
+       spin_lock(&args->inode->i_lock);
+       list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
+               dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
+       }
+       spin_unlock(&args->inode->i_lock);
+       /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
+       if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) {
+               dprintk("%s: truncating devinfo to limit (%d:%d)\n",
+                       __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
+               dev_count = PNFS_LAYOUTSTATS_MAXDEV;
+       }
+       args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
+       if (!args->devinfo)
+               return -ENOMEM;
+
+       dev_count = 0;
+       spin_lock(&args->inode->i_lock);
+       list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
+               if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
+                                                   PNFS_LAYOUTSTATS_MAXDEV)) {
+                       break;
+               }
+       }
+       spin_unlock(&args->inode->i_lock);
+       args->num_dev = dev_count;
+
+       return 0;
+}
+
+static void
+ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
+{
+       struct nfs4_ff_layout_mirror *mirror;
+       int i;
+
+       for (i = 0; i < data->args.num_dev; i++) {
+               mirror = data->args.devinfo[i].layout_private;
+               data->args.devinfo[i].layout_private = NULL;
+               pnfs_put_lseg(mirror->lseg);
+       }
+}
+
 static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .id                     = LAYOUT_FLEX_FILES,
        .name                   = "LAYOUT_FLEX_FILES",
@@ -1510,6 +1950,8 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
        .encode_layoutreturn    = ff_layout_encode_layoutreturn,
        .sync                   = pnfs_nfs_generic_sync,
+       .prepare_layoutstats    = ff_layout_prepare_layoutstats,
+       .cleanup_layoutstats    = ff_layout_cleanup_layoutstats,
 };
 
 static int __init nfs4flexfilelayout_init(void)
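
The busy timer added above is a small pattern worth spelling out: an
atomic in-flight counter starts the clock when the first I/O goes out,
and each completion charges the elapsed busy time (the real code does
the accounting under the mirror's spinlock). A single-threaded userspace
sketch of the idea:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    struct busy_timer {
            atomic_int n_ops;
            struct timespec start_time;
    };

    static struct timespec now(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return ts;
    }

    static void start_busy(struct busy_timer *t)
    {
            /* First I/O request in flight? Start the clock. */
            if (atomic_fetch_add(&t->n_ops, 1) == 0)
                    t->start_time = now();
    }

    /* Returns the elapsed busy time in nanoseconds. */
    static int64_t end_busy(struct busy_timer *t)
    {
            struct timespec end = now(), start = t->start_time;

            atomic_fetch_sub(&t->n_ops, 1);
            t->start_time = end;    /* next completion measures from here */
            return (end.tv_sec - start.tv_sec) * 1000000000LL +
                   (end.tv_nsec - start.tv_nsec);
    }

    int main(void)
    {
            static struct busy_timer t;

            start_busy(&t);
            printf("busy for %lld ns\n", (long long)end_busy(&t));
            return 0;
    }
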
index 070f20445b2d33883445038d4888538845555198..f92f9a0a856b3e698c8859923438549d1bffed37 100644 (file)
@@ -9,12 +9,17 @@
 #ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
 #define FS_NFS_NFS4FLEXFILELAYOUT_H
 
+#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+
 #include "../pnfs.h"
 
 /* XXX: Let's filter out insanely large mirror count for now to avoid oom
  * due to network error etc. */
 #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
 
+/* LAYOUTSTATS report interval in ms */
+#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
+
 struct nfs4_ff_ds_version {
        u32                             version;
        u32                             minor_version;
@@ -41,24 +46,48 @@ struct nfs4_ff_layout_ds_err {
        struct nfs4_deviceid            deviceid;
 };
 
+struct nfs4_ff_io_stat {
+       __u64                           ops_requested;
+       __u64                           bytes_requested;
+       __u64                           ops_completed;
+       __u64                           bytes_completed;
+       __u64                           bytes_not_delivered;
+       ktime_t                         total_busy_time;
+       ktime_t                         aggregate_completion_time;
+};
+
+struct nfs4_ff_busy_timer {
+       ktime_t start_time;
+       atomic_t n_ops;
+};
+
+struct nfs4_ff_layoutstat {
+       struct nfs4_ff_io_stat io_stat;
+       struct nfs4_ff_busy_timer busy_timer;
+};
+
 struct nfs4_ff_layout_mirror {
+       struct pnfs_layout_segment      *lseg; /* back pointer */
        u32                             ds_count;
        u32                             efficiency;
        struct nfs4_ff_layout_ds        *mirror_ds;
        u32                             fh_versions_cnt;
        struct nfs_fh                   *fh_versions;
        nfs4_stateid                    stateid;
-       struct nfs4_string              user_name;
-       struct nfs4_string              group_name;
        u32                             uid;
        u32                             gid;
        struct rpc_cred                 *cred;
        spinlock_t                      lock;
+       struct nfs4_ff_layoutstat       read_stat;
+       struct nfs4_ff_layoutstat       write_stat;
+       ktime_t                         start_time;
+       ktime_t                         last_report_time;
 };
 
 struct nfs4_ff_layout_segment {
        struct pnfs_layout_segment      generic_hdr;
        u64                             stripe_unit;
+       u32                             flags;
        u32                             mirror_array_cnt;
        struct nfs4_ff_layout_mirror    **mirror_array;
 };
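
FF_LAYOUTSTATS_REPORT_INTERVAL drives a per-mirror rate limiter: the
first I/O on a mirror only records timestamps, and a LAYOUTSTATS report
is requested at most once per interval after that. A userspace sketch
(milliseconds, one mirror, no locking) of that check:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    #define REPORT_INTERVAL_MS 60000L  /* FF_LAYOUTSTATS_REPORT_INTERVAL */

    static int64_t now_ms(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
    }

    struct mirror { int64_t start_time_ms, last_report_ms; };

    /* Called at I/O start; true means "send a LAYOUTSTATS report now". */
    static bool start_io_should_report(struct mirror *m)
    {
            int64_t now = now_ms();

            if (m->start_time_ms == 0)
                    m->start_time_ms = now;
            if (m->last_report_ms == 0)
                    m->last_report_ms = now;
            if (now - m->last_report_ms >= REPORT_INTERVAL_MS) {
                    m->last_report_ms = now;
                    return true;
            }
            return false;
    }

    int main(void)
    {
            struct mirror m = { 0, 0 };

            /* First I/O never reports; it only arms the timestamps. */
            printf("report now? %d\n", start_io_should_report(&m));
            return 0;
    }
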
index 77a2d026aa12b62bdc29dac2345cff0b3237e9c4..f13e1969eedd911bf6a5d9be6af6e4ae403f6c1e 100644 (file)
@@ -324,7 +324,8 @@ static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
                                __func__, PTR_ERR(cred));
                        return PTR_ERR(cred);
                } else {
-                       mirror->cred = cred;
+                       if (cmpxchg(&mirror->cred, NULL, cred))
+                               put_rpccred(cred);
                }
        }
        return 0;
@@ -386,7 +387,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
        /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
        smp_rmb();
        if (ds->ds_clp)
-               goto out;
+               goto out_update_creds;
 
        flavor = nfs4_ff_layout_choose_authflavor(mirror);
 
@@ -430,7 +431,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                        }
                }
        }
-
+out_update_creds:
        if (ff_layout_update_mirror_cred(mirror, ds))
                ds = NULL;
 out:
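
The credential fix above is the classic publish-once idiom:
cmpxchg(&mirror->cred, NULL, cred) installs the pointer only if nothing
is published yet, and the loser of the race drops its own reference
instead of overwriting (and leaking) the winner's. A userspace sketch of
the idiom using C11 atomics:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct cred { int refcount; };

    static void put_cred(struct cred *c)
    {
            if (--c->refcount == 0)
                    free(c);
    }

    static _Atomic(struct cred *) mirror_cred;

    static void update_mirror_cred(struct cred *cred)
    {
            struct cred *expected = NULL;

            /* Equivalent of kernel cmpxchg(&mirror->cred, NULL, cred). */
            if (!atomic_compare_exchange_strong(&mirror_cred, &expected,
                                                cred))
                    put_cred(cred);  /* lost the race: drop our reference */
    }

    int main(void)
    {
            struct cred *a = calloc(1, sizeof(*a));
            struct cred *b = calloc(1, sizeof(*b));

            a->refcount = b->refcount = 1;
            update_mirror_cred(a);  /* published */
            update_mirror_cred(b);  /* freed; mirror_cred still points to a */
            printf("published %p\n", (void *)atomic_load(&mirror_cred));
            return 0;
    }
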
index f734562c6d244034cb5036fee8ab0b7d69cc90c5..b77b328a06d74f0124d2a65b51fac0fc21fbd692 100644 (file)
@@ -678,6 +678,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        if (!err) {
                generic_fillattr(inode, stat);
                stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+               if (S_ISDIR(inode->i_mode))
+                       stat->blksize = NFS_SERVER(inode)->dtsize;
        }
 out:
        trace_nfs_getattr_exit(inode, err);
@@ -2008,17 +2010,15 @@ static int __init init_nfs_fs(void)
        if (err)
                goto out1;
 
-#ifdef CONFIG_PROC_FS
        rpc_proc_register(&init_net, &nfs_rpcstat);
-#endif
-       if ((err = register_nfs_fs()) != 0)
+
+       err = register_nfs_fs();
+       if (err)
                goto out0;
 
        return 0;
 out0:
-#ifdef CONFIG_PROC_FS
        rpc_proc_unregister(&init_net, "nfs");
-#endif
        nfs_destroy_directcache();
 out1:
        nfs_destroy_writepagecache();
@@ -2049,9 +2049,7 @@ static void __exit exit_nfs_fs(void)
        nfs_destroy_nfspagecache();
        nfs_fscache_unregister();
        unregister_pernet_subsys(&nfs_net_ops);
-#ifdef CONFIG_PROC_FS
        rpc_proc_unregister(&init_net, "nfs");
-#endif
        unregister_nfs_fs();
        nfs_fs_proc_exit();
        nfsiod_stop();
index 53852a4bd88be68781bb9dd8a39760fd497d1d61..9b04c2e6fffc3f306f3c598b7c4557beff653c8e 100644 (file)
@@ -1342,7 +1342,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
        if (args->npages != 0)
                xdr_write_pages(xdr, args->pages, 0, args->len);
        else
-               xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
+               xdr_reserve_space(xdr, args->len);
 
        error = nfsacl_encode(xdr->buf, base, args->inode,
                            (args->mask & NFS_ACL) ?
index 7afb8947dfdf3e299ee39c6490640f40567a70b7..ff66ae700b8991eeed513e210397f6639c0e5f70 100644 (file)
@@ -5,11 +5,18 @@
 #ifndef __LINUX_FS_NFS_NFS4_2_H
 #define __LINUX_FS_NFS_NFS4_2_H
 
+/*
+ * FIXME:  four LAYOUTSTATS calls per compound at most! Do we need to support
+ * more? Need to consider not to pre-allocate too much for a compound.
+ */
+#define PNFS_LAYOUTSTATS_MAXDEV (4)
+
 /* nfs4.2proc.c */
 int nfs42_proc_allocate(struct file *, loff_t, loff_t);
 int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
-
+int nfs42_proc_layoutstats_generic(struct nfs_server *,
+                                  struct nfs42_layoutstat_data *);
 /* nfs4.2xdr.h */
 extern struct rpc_procinfo nfs4_2_procedures[];
 
index 3a9e75235f30e60e5a4ae974864c63fb003b969d..f486b80f927ab7204159852a9740900a6c73aec6 100644 (file)
 #include <linux/nfs_fs.h>
 #include "nfs4_fs.h"
 #include "nfs42.h"
+#include "iostat.h"
+#include "pnfs.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
 
 static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
                                fmode_t fmode)
@@ -165,3 +170,85 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
 
        return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
 }
+
+static void
+nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
+{
+       struct nfs42_layoutstat_data *data = calldata;
+       struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+       nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args,
+                            &data->res.seq_res, task);
+}
+
+static void
+nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
+{
+       struct nfs42_layoutstat_data *data = calldata;
+
+       if (!nfs4_sequence_done(task, &data->res.seq_res))
+               return;
+
+       switch (task->tk_status) {
+       case 0:
+               break;
+       case -ENOTSUPP:
+       case -EOPNOTSUPP:
+               NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+       default:
+               dprintk("%s server returns %d\n", __func__, task->tk_status);
+       }
+}
+
+static void
+nfs42_layoutstat_release(void *calldata)
+{
+       struct nfs42_layoutstat_data *data = calldata;
+       struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+
+       if (nfss->pnfs_curr_ld->cleanup_layoutstats)
+               nfss->pnfs_curr_ld->cleanup_layoutstats(data);
+
+       pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
+       smp_mb__before_atomic();
+       clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
+       smp_mb__after_atomic();
+       nfs_iput_and_deactive(data->inode);
+       kfree(data->args.devinfo);
+       kfree(data);
+}
+
+static const struct rpc_call_ops nfs42_layoutstat_ops = {
+       .rpc_call_prepare = nfs42_layoutstat_prepare,
+       .rpc_call_done = nfs42_layoutstat_done,
+       .rpc_release = nfs42_layoutstat_release,
+};
+
+int nfs42_proc_layoutstats_generic(struct nfs_server *server,
+                                  struct nfs42_layoutstat_data *data)
+{
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
+               .rpc_argp = &data->args,
+               .rpc_resp = &data->res,
+       };
+       struct rpc_task_setup task_setup = {
+               .rpc_client = server->client,
+               .rpc_message = &msg,
+               .callback_ops = &nfs42_layoutstat_ops,
+               .callback_data = data,
+               .flags = RPC_TASK_ASYNC,
+       };
+       struct rpc_task *task;
+
+       data->inode = nfs_igrab_and_active(data->args.inode);
+       if (!data->inode) {
+               nfs42_layoutstat_release(data);
+               return -EAGAIN;
+       }
+       nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+       task = rpc_run_task(&task_setup);
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+       return 0;
+}
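
One ownership detail of nfs42_proc_layoutstats_generic() is worth
noting: once the request is built, nfs42_layoutstat_release() is the
single cleanup path for the data, invoked via rpc_release on completion
or called directly when the inode can't be pinned. A userspace sketch
(hypothetical types) of that single-owner rule:

    #include <stdio.h>
    #include <stdlib.h>

    struct stat_data { int num_dev; };

    /* Single cleanup path, mirroring nfs42_layoutstat_release(). */
    static void stats_release(struct stat_data *data)
    {
            free(data);
    }

    /* Hypothetical async runner: on success it owns data and calls the
     * release callback itself once the work completes. */
    static int run_task(struct stat_data *data,
                        void (*release)(struct stat_data *))
    {
            printf("reporting stats for %d devices\n", data->num_dev);
            release(data);
            return 0;
    }

    static int proc_layoutstats(struct stat_data *data)
    {
            /* Model of the nfs_igrab_and_active() failure branch: the
             * caller still owns data, so release it before bailing. */
            if (data->num_dev == 0) {
                    stats_release(data);
                    return -1;      /* -EAGAIN in the real code */
            }
            return run_task(data, stats_release);
    }

    int main(void)
    {
            struct stat_data *d = calloc(1, sizeof(*d));

            d->num_dev = 4;
            return proc_layoutstats(d);
    }
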
index 1a25b27248f2ff5fd0026b00a3032589ee8cdff5..a6bd27da6286f9fee14f0b087eddcc1ec437cdde 100644 (file)
@@ -4,6 +4,8 @@
 #ifndef __LINUX_FS_NFS_NFS4_2XDR_H
 #define __LINUX_FS_NFS_NFS4_2XDR_H
 
+#include "nfs42.h"
+
 #define encode_fallocate_maxsz         (encode_stateid_maxsz + \
                                         2 /* offset */ + \
                                         2 /* length */)
                                         1 /* whence */ + \
                                         2 /* offset */ + \
                                         2 /* length */)
+#define encode_io_info_maxsz           4
+#define encode_layoutstats_maxsz       (op_decode_hdr_maxsz + \
+                                       2 /* offset */ + \
+                                       2 /* length */ + \
+                                       encode_stateid_maxsz + \
+                                       encode_io_info_maxsz + \
+                                       encode_io_info_maxsz + \
+                                       1 /* opaque devaddr4 length */ + \
+                                       XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
+#define decode_layoutstats_maxsz       (op_decode_hdr_maxsz)
 
 #define NFS4_enc_allocate_sz           (compound_encode_hdr_maxsz + \
                                         encode_putfh_maxsz + \
 #define NFS4_dec_seek_sz               (compound_decode_hdr_maxsz + \
                                         decode_putfh_maxsz + \
                                         decode_seek_maxsz)
+#define NFS4_enc_layoutstats_sz                (compound_encode_hdr_maxsz + \
+                                        encode_sequence_maxsz + \
+                                        encode_putfh_maxsz + \
+                                        PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz)
+#define NFS4_dec_layoutstats_sz                (compound_decode_hdr_maxsz + \
+                                        decode_sequence_maxsz + \
+                                        decode_putfh_maxsz + \
+                                        PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
 
 
 static void encode_fallocate(struct xdr_stream *xdr,
@@ -81,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr,
        encode_uint32(xdr, args->sa_what);
 }
 
+static void encode_layoutstats(struct xdr_stream *xdr,
+                              struct nfs42_layoutstat_args *args,
+                              struct nfs42_layoutstat_devinfo *devinfo,
+                              struct compound_hdr *hdr)
+{
+       __be32 *p;
+
+       encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr);
+       p = reserve_space(xdr, 8 + 8);
+       p = xdr_encode_hyper(p, devinfo->offset);
+       p = xdr_encode_hyper(p, devinfo->length);
+       encode_nfs4_stateid(xdr, &args->stateid);
+       p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4);
+       p = xdr_encode_hyper(p, devinfo->read_count);
+       p = xdr_encode_hyper(p, devinfo->read_bytes);
+       p = xdr_encode_hyper(p, devinfo->write_count);
+       p = xdr_encode_hyper(p, devinfo->write_bytes);
+       p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data,
+                       NFS4_DEVICEID4_SIZE);
+       /* Encode layoutupdate4 */
+       *p++ = cpu_to_be32(devinfo->layout_type);
+       if (devinfo->layoutstats_encode != NULL)
+               devinfo->layoutstats_encode(xdr, args, devinfo);
+       else
+               encode_uint32(xdr, 0);
+}
+
 /*
  * Encode ALLOCATE request
  */
@@ -137,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
        encode_nops(&hdr);
 }
 
+/*
+ * Encode LAYOUTSTATS request
+ */
+static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
+                                    struct xdr_stream *xdr,
+                                    struct nfs42_layoutstat_args *args)
+{
+       int i;
+
+       struct compound_hdr hdr = {
+               .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+       };
+
+       encode_compound_hdr(xdr, req, &hdr);
+       encode_sequence(xdr, &args->seq_args, &hdr);
+       encode_putfh(xdr, args->fh, &hdr);
+       WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+       for (i = 0; i < args->num_dev; i++)
+               encode_layoutstats(xdr, args, &args->devinfo[i], &hdr);
+       encode_nops(&hdr);
+}
+
 static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 {
        return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -169,6 +238,12 @@ out_overflow:
        return -EIO;
 }
 
+static int decode_layoutstats(struct xdr_stream *xdr,
+                             struct nfs42_layoutstat_res *res)
+{
+       return decode_op_hdr(xdr, OP_LAYOUTSTATS);
+}
+
 /*
  * Decode ALLOCATE request
  */
@@ -246,4 +321,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
 out:
        return status;
 }
+
+/*
+ * Decode LAYOUTSTATS request
+ */
+static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
+                                   struct xdr_stream *xdr,
+                                   struct nfs42_layoutstat_res *res)
+{
+       struct compound_hdr hdr;
+       int status, i;
+
+       status = decode_compound_hdr(xdr, &hdr);
+       if (status)
+               goto out;
+       status = decode_sequence(xdr, &res->seq_res, rqstp);
+       if (status)
+               goto out;
+       status = decode_putfh(xdr);
+       if (status)
+               goto out;
+       WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+       for (i = 0; i < res->num_dev; i++) {
+               status = decode_layoutstats(xdr, res);
+               if (status)
+                       goto out;
+       }
+out:
+       res->rpc_status = status;
+       return status;
+}
+
 #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
index fdef424b0cd3c6120ddc7d9e379e49647b7b24fb..ea3bee919a765840a267f8fc59ccdec4ef61f676 100644 (file)
@@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception
 extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
                          struct rpc_message *, struct nfs4_sequence_args *,
                          struct nfs4_sequence_res *, int);
+extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int);
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
 extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
index e42be52a8c18d8121c934a5ac79483e77166206c..3aa6a9ba51136f31f30dea29d60dded106b05241 100644 (file)
@@ -676,7 +676,6 @@ found:
                break;
        }
 
-       /* No matching nfs_client found. */
        spin_unlock(&nn->nfs_client_lock);
        dprintk("NFS: <-- %s status = %d\n", __func__, status);
        nfs_put_client(prev);
index f58c17b3b480367c6322359ae7ca4b33b3695348..dcd39d4e2efebd78eed64d4df00fd2745f747027 100644 (file)
@@ -41,6 +41,10 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 
        dprintk("NFS: open file(%pd2)\n", dentry);
 
+       err = nfs_check_flags(openflags);
+       if (err)
+               return err;
+
        if ((openflags & O_ACCMODE) == 3)
                openflags--;
 
index c0b3a16b4a00806f79ea9eb28b6933a8d94bcc52..039b3eb6d83404f33224961465406d52203ff570 100644 (file)
@@ -35,13 +35,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p
                goto out;
        }
 
-       if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
-               printk(KERN_ERR "nfs4_get_rootfh:"
-                      " getroot obtained referral\n");
-               ret = -EREMOTE;
-               goto out;
-       }
-
        memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
 out:
        nfs_free_fattr(fsinfo.fattr);
index 2e1737c40a29837488823e04f27db975e0a51b9b..535dfc69c628f825cc4422339406b1365f66e8d4 100644 (file)
@@ -494,12 +494,7 @@ nfs_idmap_delete(struct nfs_client *clp)
 
 int nfs_idmap_init(void)
 {
-       int ret;
-       ret = nfs_idmap_init_keyring();
-       if (ret != 0)
-               goto out;
-out:
-       return ret;
+       return nfs_idmap_init_keyring();
 }
 
 void nfs_idmap_quit(void)
index 55e1e3af23a3d3f2313f977b185eb8c3f8ccbc6d..6f228b5af819ea576240c40869c1da74d823e460 100644 (file)
@@ -356,6 +356,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
                case 0:
                        return 0;
                case -NFS4ERR_OPENMODE:
+               case -NFS4ERR_DELEG_REVOKED:
+               case -NFS4ERR_ADMIN_REVOKED:
+               case -NFS4ERR_BAD_STATEID:
                        if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
                                nfs4_inode_return_delegation(inode);
                                exception->retry = 1;
@@ -367,15 +370,6 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
                        if (ret < 0)
                                break;
                        goto wait_on_recovery;
-               case -NFS4ERR_DELEG_REVOKED:
-               case -NFS4ERR_ADMIN_REVOKED:
-               case -NFS4ERR_BAD_STATEID:
-                       if (state == NULL)
-                               break;
-                       ret = nfs4_schedule_stateid_recovery(server, state);
-                       if (ret < 0)
-                               break;
-                       goto wait_on_recovery;
                case -NFS4ERR_EXPIRED:
                        if (state != NULL) {
                                ret = nfs4_schedule_stateid_recovery(server, state);
@@ -482,8 +476,8 @@ struct nfs4_call_sync_data {
        struct nfs4_sequence_res *seq_res;
 };
 
-static void nfs4_init_sequence(struct nfs4_sequence_args *args,
-                              struct nfs4_sequence_res *res, int cache_reply)
+void nfs4_init_sequence(struct nfs4_sequence_args *args,
+                       struct nfs4_sequence_res *res, int cache_reply)
 {
        args->sa_slot = NULL;
        args->sa_cache_this = cache_reply;
@@ -1553,6 +1547,13 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
        struct nfs4_state *newstate;
        int ret;
 
+       if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
+            opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) &&
+           (opendata->o_arg.u.delegation_type & fmode) != fmode)
+               /* This mode can't have been delegated, so we must have
+                * a valid open_stateid to cover it - no need to reclaim.
+                */
+               return 0;
        opendata->o_arg.open_flags = 0;
        opendata->o_arg.fmode = fmode;
        opendata->o_arg.share_access = nfs4_map_atomic_open_share(
@@ -1684,6 +1685,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
                                        "%d.\n", __func__, err);
                case 0:
                case -ENOENT:
+               case -EAGAIN:
                case -ESTALE:
                        break;
                case -NFS4ERR_BADSESSION:
@@ -3355,6 +3357,8 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
                        goto out;
                case -NFS4ERR_MOVED:
                        err = nfs4_get_referral(client, dir, name, fattr, fhandle);
+                       if (err == -NFS4ERR_MOVED)
+                               err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);
                        goto out;
                case -NFS4ERR_WRONGSEC:
                        err = -EPERM;
@@ -4955,49 +4959,128 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
        memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
 
-static unsigned int
-nfs4_init_nonuniform_client_string(struct nfs_client *clp,
-                                  char *buf, size_t len)
+static int
+nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 {
-       unsigned int result;
+       int result;
+       size_t len;
+       char *str;
+       bool retried = false;
 
        if (clp->cl_owner_id != NULL)
-               return strlcpy(buf, clp->cl_owner_id, len);
+               return 0;
+retry:
+       rcu_read_lock();
+       len = 10 + strlen(clp->cl_ipaddr) + 1 +
+               strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
+               1 +
+               strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
+               1;
+       rcu_read_unlock();
+
+       if (len > NFS4_OPAQUE_LIMIT + 1)
+               return -EINVAL;
+
+       /*
+        * Since this string is allocated at mount time, and held until the
+        * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+        * about a memory-reclaim deadlock.
+        */
+       str = kmalloc(len, GFP_KERNEL);
+       if (!str)
+               return -ENOMEM;
 
        rcu_read_lock();
-       result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
-                               clp->cl_ipaddr,
-                               rpc_peeraddr2str(clp->cl_rpcclient,
-                                                       RPC_DISPLAY_ADDR),
-                               rpc_peeraddr2str(clp->cl_rpcclient,
-                                                       RPC_DISPLAY_PROTO));
+       result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
+                       clp->cl_ipaddr,
+                       rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+                       rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
        rcu_read_unlock();
-       clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
-       return result;
+
+       /* Did something change? */
+       if (result >= len) {
+               kfree(str);
+               if (retried)
+                       return -EINVAL;
+               retried = true;
+               goto retry;
+       }
+       clp->cl_owner_id = str;
+       return 0;
 }
 
-static unsigned int
-nfs4_init_uniform_client_string(struct nfs_client *clp,
-                               char *buf, size_t len)
+static int
+nfs4_init_uniquifier_client_string(struct nfs_client *clp)
+{
+       int result;
+       size_t len;
+       char *str;
+
+       len = 10 + 10 + 1 + 10 + 1 +
+               strlen(nfs4_client_id_uniquifier) + 1 +
+               strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+       if (len > NFS4_OPAQUE_LIMIT + 1)
+               return -EINVAL;
+
+       /*
+        * Since this string is allocated at mount time, and held until the
+        * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+        * about a memory-reclaim deadlock.
+        */
+       str = kmalloc(len, GFP_KERNEL);
+       if (!str)
+               return -ENOMEM;
+
+       result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+                       clp->rpc_ops->version, clp->cl_minorversion,
+                       nfs4_client_id_uniquifier,
+                       clp->cl_rpcclient->cl_nodename);
+       if (result >= len) {
+               kfree(str);
+               return -EINVAL;
+       }
+       clp->cl_owner_id = str;
+       return 0;
+}
+
+static int
+nfs4_init_uniform_client_string(struct nfs_client *clp)
 {
-       const char *nodename = clp->cl_rpcclient->cl_nodename;
-       unsigned int result;
+       int result;
+       size_t len;
+       char *str;
 
        if (clp->cl_owner_id != NULL)
-               return strlcpy(buf, clp->cl_owner_id, len);
+               return 0;
 
        if (nfs4_client_id_uniquifier[0] != '\0')
-               result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
-                               clp->rpc_ops->version,
-                               clp->cl_minorversion,
-                               nfs4_client_id_uniquifier,
-                               nodename);
-       else
-               result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
-                               clp->rpc_ops->version, clp->cl_minorversion,
-                               nodename);
-       clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
-       return result;
+               return nfs4_init_uniquifier_client_string(clp);
+
+       len = 10 + 10 + 1 + 10 + 1 +
+               strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+       if (len > NFS4_OPAQUE_LIMIT + 1)
+               return -EINVAL;
+
+       /*
+        * Since this string is allocated at mount time, and held until the
+        * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+        * about a memory-reclaim deadlock.
+        */
+       str = kmalloc(len, GFP_KERNEL);
+       if (!str)
+               return -ENOMEM;
+
+       result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+                       clp->rpc_ops->version, clp->cl_minorversion,
+                       clp->cl_rpcclient->cl_nodename);
+       if (result >= len) {
+               kfree(str);
+               return -EINVAL;
+       }
+       clp->cl_owner_id = str;
+       return 0;
 }
 
 /*
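
The three client-string helpers above share one measure/allocate/format/verify pattern. As a standalone illustration (a hypothetical helper, not part of this patch), the same idiom looks like the sketch below; it uses snprintf(), whose return value is the length the full string would have needed, so the truncation re-check can actually fire if the inputs change between the two passes:

    /* Sketch only: build "a/b", re-measuring once if the inputs changed
     * between the length calculation and the format step. All names are
     * illustrative. */
    static char *format_with_recheck(const char *a, const char *b)
    {
            bool retried = false;
            size_t len;
            char *str;
    retry:
            len = strlen(a) + 1 + strlen(b) + 1;
            str = kmalloc(len, GFP_KERNEL);
            if (!str)
                    return NULL;
            /* snprintf() reports the length it wanted to write */
            if (snprintf(str, len, "%s/%s", a, b) >= len) {
                    kfree(str);
                    if (retried)
                            return NULL;
                    retried = true;
                    goto retry;
            }
            return str;
    }
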
@@ -5044,7 +5127,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
        struct nfs4_setclientid setclientid = {
                .sc_verifier = &sc_verifier,
                .sc_prog = program,
-               .sc_cb_ident = clp->cl_cb_ident,
+               .sc_clnt = clp,
        };
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -5064,16 +5147,15 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 
        /* nfs_client_id4 */
        nfs4_init_boot_verifier(clp, &sc_verifier);
+
        if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
-               setclientid.sc_name_len =
-                               nfs4_init_uniform_client_string(clp,
-                                               setclientid.sc_name,
-                                               sizeof(setclientid.sc_name));
+               status = nfs4_init_uniform_client_string(clp);
        else
-               setclientid.sc_name_len =
-                               nfs4_init_nonuniform_client_string(clp,
-                                               setclientid.sc_name,
-                                               sizeof(setclientid.sc_name));
+               status = nfs4_init_nonuniform_client_string(clp);
+
+       if (status)
+               goto out;
+
        /* cb_client4 */
        setclientid.sc_netid_len =
                                nfs4_init_callback_netid(clp,
@@ -5083,9 +5165,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
                                sizeof(setclientid.sc_uaddr), "%s.%u.%u",
                                clp->cl_ipaddr, port >> 8, port & 255);
 
-       dprintk("NFS call  setclientid auth=%s, '%.*s'\n",
+       dprintk("NFS call  setclientid auth=%s, '%s'\n",
                clp->cl_rpcclient->cl_auth->au_ops->au_name,
-               setclientid.sc_name_len, setclientid.sc_name);
+               clp->cl_owner_id);
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task)) {
                status = PTR_ERR(task);
@@ -5402,6 +5484,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
        atomic_inc(&lsp->ls_count);
        /* Ensure we don't close file until we're done freeing locks! */
        p->ctx = get_nfs_open_context(ctx);
+       get_file(fl->fl_file);
        memcpy(&p->fl, fl, sizeof(p->fl));
        p->server = NFS_SERVER(inode);
        return p;
@@ -5413,6 +5496,7 @@ static void nfs4_locku_release_calldata(void *data)
        nfs_free_seqid(calldata->arg.seqid);
        nfs4_put_lock_state(calldata->lsp);
        put_nfs_open_context(calldata->ctx);
+       fput(calldata->fl.fl_file);
        kfree(calldata);
 }
 
@@ -6846,11 +6930,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
        };
 
        nfs4_init_boot_verifier(clp, &verifier);
-       args.id_len = nfs4_init_uniform_client_string(clp, args.id,
-                                                       sizeof(args.id));
-       dprintk("NFS call  exchange_id auth=%s, '%.*s'\n",
+
+       status = nfs4_init_uniform_client_string(clp);
+       if (status)
+               goto out;
+
+       dprintk("NFS call  exchange_id auth=%s, '%s'\n",
                clp->cl_rpcclient->cl_auth->au_ops->au_name,
-               args.id_len, args.id);
+               clp->cl_owner_id);
 
        res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
                                        GFP_NOFS);
@@ -6885,7 +6972,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
                /* unsupported! */
                WARN_ON_ONCE(1);
                status = -EINVAL;
-               goto out_server_scope;
+               goto out_impl_id;
        }
 
        status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
@@ -6913,6 +7000,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
                /* use the most recent implementation id */
                kfree(clp->cl_implid);
                clp->cl_implid = res.impl_id;
+               res.impl_id = NULL;
 
                if (clp->cl_serverscope != NULL &&
                    !nfs41_same_server_scope(clp->cl_serverscope,
@@ -6926,15 +7014,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 
                if (clp->cl_serverscope == NULL) {
                        clp->cl_serverscope = res.server_scope;
-                       goto out;
+                       res.server_scope = NULL;
                }
-       } else
-               kfree(res.impl_id);
+       }
 
-out_server_owner:
-       kfree(res.server_owner);
+out_impl_id:
+       kfree(res.impl_id);
 out_server_scope:
        kfree(res.server_scope);
+out_server_owner:
+       kfree(res.server_owner);
 out:
        if (clp->cl_implid != NULL)
                dprintk("NFS reply exchange_id: Server Implementation ID: "
@@ -8061,9 +8150,8 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
        struct rpc_task *task;
        int status = 0;
 
-       dprintk("NFS: %4d initiating layoutcommit call. sync %d "
-               "lbw: %llu inode %lu\n",
-               data->task.tk_pid, sync,
+       dprintk("NFS: initiating layoutcommit call. sync %d "
+               "lbw: %llu inode %lu\n", sync,
                data->args.lastbytewritten,
                data->args.inode->i_ino);
 
@@ -8557,7 +8645,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
                | NFS_CAP_ATOMIC_OPEN_V1
                | NFS_CAP_ALLOCATE
                | NFS_CAP_DEALLOCATE
-               | NFS_CAP_SEEK,
+               | NFS_CAP_SEEK
+               | NFS_CAP_LAYOUTSTATS,
        .init_client = nfs41_init_client,
        .shutdown_client = nfs41_shutdown_client,
        .match_stateid = nfs41_match_stateid,
index 2782cfca22650922e012a4f86a1755e3cca68243..605840dc89cf9e28c173659af201aab109f9328d 100644 (file)
@@ -309,7 +309,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 
        if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
                goto do_confirm;
-       nfs4_begin_drain_session(clp);
        status = nfs4_proc_exchange_id(clp, cred);
        if (status != 0)
                goto out;
@@ -1482,6 +1481,8 @@ restart:
                                        spin_unlock(&state->state_lock);
                                }
                                nfs4_put_open_state(state);
+                               clear_bit(NFS4CLNT_RECLAIM_NOGRACE,
+                                       &state->flags);
                                spin_lock(&sp->so_lock);
                                goto restart;
                        }
@@ -1830,6 +1831,7 @@ static int nfs4_establish_lease(struct nfs_client *clp)
                clp->cl_mvops->reboot_recovery_ops;
        int status;
 
+       nfs4_begin_drain_session(clp);
        cred = nfs4_get_clid_cred(clp);
        if (cred == NULL)
                return -ENOENT;
index 0aea97841d3038b56056d0d7fcd0dcddb11f584e..558cd65dbdb752d111b5b85649b72bae36fdf040 100644 (file)
@@ -139,7 +139,8 @@ static int nfs4_stat_to_errno(int);
 #define encode_setclientid_maxsz \
                                (op_encode_hdr_maxsz + \
                                XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
-                               XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
+                               /* client name */ \
+                               1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
                                1 /* sc_prog */ + \
                                1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
                                1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
@@ -288,7 +289,8 @@ static int nfs4_stat_to_errno(int);
 #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
                                encode_verifier_maxsz + \
                                1 /* co_ownerid.len */ + \
-                               XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
+                               /* eia_clientowner */ \
+                               1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
                                1 /* flags */ + \
                                1 /* spa_how */ + \
                                /* max is SP4_MACH_CRED (for now) */ + \
@@ -1667,13 +1669,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
        encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
        encode_nfs4_verifier(xdr, setclientid->sc_verifier);
 
-       encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
+       encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id),
+                       setclientid->sc_clnt->cl_owner_id);
        p = reserve_space(xdr, 4);
        *p = cpu_to_be32(setclientid->sc_prog);
        encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
        encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
        p = reserve_space(xdr, 4);
-       *p = cpu_to_be32(setclientid->sc_cb_ident);
+       *p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident);
 }
 
 static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
@@ -1747,7 +1750,8 @@ static void encode_exchange_id(struct xdr_stream *xdr,
        encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
        encode_nfs4_verifier(xdr, args->verifier);
 
-       encode_string(xdr, args->id_len, args->id);
+       encode_string(xdr, strlen(args->client->cl_owner_id),
+                       args->client->cl_owner_id);
 
        encode_uint32(xdr, args->flags);
        encode_uint32(xdr, args->state_protect.how);
@@ -7427,6 +7431,7 @@ struct rpc_procinfo       nfs4_procedures[] = {
        PROC(SEEK,              enc_seek,               dec_seek),
        PROC(ALLOCATE,          enc_allocate,           dec_allocate),
        PROC(DEALLOCATE,        enc_deallocate,         dec_deallocate),
+       PROC(LAYOUTSTATS,       enc_layoutstats,        dec_layoutstats),
 #endif /* CONFIG_NFS_V4_2 */
 };
 
index 282b3936951060a2c8a6216c4c690e166a5fcb8d..1da68d3b1edabdb78c60527f502af5f40d6cf69b 100644 (file)
@@ -636,9 +636,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
 
        hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
 
-       dprintk("NFS: %5u initiated pgio call "
+       dprintk("NFS: initiated pgio call "
                "(req %s/%llu, %u bytes @ offset %llu)\n",
-               hdr->task.tk_pid,
                hdr->inode->i_sb->s_id,
                (unsigned long long)NFS_FILEID(hdr->inode),
                hdr->args.count,
@@ -690,8 +689,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
 static void nfs_pgio_release(void *calldata)
 {
        struct nfs_pgio_header *hdr = calldata;
-       if (hdr->rw_ops->rw_release)
-               hdr->rw_ops->rw_release(hdr);
        nfs_pgio_data_destroy(hdr);
        hdr->completion_ops->completion(hdr);
 }
@@ -711,7 +708,9 @@ static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
  * nfs_pageio_init - initialise a page io descriptor
  * @desc: pointer to descriptor
  * @inode: pointer to inode
- * @doio: pointer to io function
+ * @pg_ops: pointer to pageio operations
+ * @compl_ops: pointer to pageio completion operations
+ * @rw_ops: pointer to nfs read/write operations
  * @bsize: io block size
  * @io_flags: extra parameters for the io function
  */
@@ -1186,6 +1185,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
  * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
  *                             nfs_pageio_descriptor
  * @desc: pointer to io descriptor
+ * @mirror_idx: index of the mirror on which to complete I/O
  */
 static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
                                       u32 mirror_idx)
index 230606243be6ad079733e583d173b14d3baeda55..0ba9a02c95664960f8c0f46ea97249bd8653fe16 100644 (file)
@@ -35,6 +35,7 @@
 #include "iostat.h"
 #include "nfs4trace.h"
 #include "delegation.h"
+#include "nfs42.h"
 
 #define NFSDBG_FACILITY                NFSDBG_PNFS
 #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -1821,6 +1822,7 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
        /* Resend all requests through the MDS */
        nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
                              hdr->completion_ops);
+       set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
        return nfs_pageio_resend(&pgio, hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
@@ -1865,6 +1867,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
                mirror->pg_recoalesce = 1;
        }
        nfs_pgio_data_destroy(hdr);
+       hdr->release(hdr);
 }
 
 static enum pnfs_try_status
@@ -1979,6 +1982,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
                mirror->pg_recoalesce = 1;
        }
        nfs_pgio_data_destroy(hdr);
+       hdr->release(hdr);
 }
 
 /*
@@ -2247,3 +2251,63 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
        }
        return thp;
 }
+
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int
+pnfs_report_layoutstat(struct inode *inode)
+{
+       struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+       struct nfs_server *server = NFS_SERVER(inode);
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct nfs42_layoutstat_data *data;
+       struct pnfs_layout_hdr *hdr;
+       int status = 0;
+
+       if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
+               goto out;
+
+       if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
+               goto out;
+
+       if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
+               goto out;
+
+       spin_lock(&inode->i_lock);
+       if (!NFS_I(inode)->layout) {
+               spin_unlock(&inode->i_lock);
+               goto out;
+       }
+       hdr = NFS_I(inode)->layout;
+       pnfs_get_layout_hdr(hdr);
+       spin_unlock(&inode->i_lock);
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data) {
+               status = -ENOMEM;
+               goto out_put;
+       }
+
+       data->args.fh = NFS_FH(inode);
+       data->args.inode = inode;
+       nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
+       status = ld->prepare_layoutstats(&data->args);
+       if (status)
+               goto out_free;
+
+       status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
+
+out:
+       dprintk("%s returns %d\n", __func__, status);
+       return status;
+
+out_free:
+       kfree(data);
+out_put:
+       pnfs_put_layout_hdr(hdr);
+       smp_mb__before_atomic();
+       clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
+       smp_mb__after_atomic();
+       goto out;
+}
+EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
+#endif
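
pnfs_report_layoutstat() leaves the real work of filling args->devinfo to the layout driver's new ->prepare_layoutstats hook (declared in the pnfs.h hunk below). A minimal sketch of such a hook, assuming a single deviceid that covers the whole file; the function name and the omitted counter bookkeeping are hypothetical:

    /* Hypothetical driver hook: report one deviceid spanning the whole
     * file. A real driver would also fill in dev_id, layout_type and
     * the read/write counters from its own accounting. */
    static int example_prepare_layoutstats(struct nfs42_layoutstat_args *args)
    {
            struct nfs42_layoutstat_devinfo *devinfo;

            devinfo = kzalloc(sizeof(*devinfo), GFP_NOFS);
            if (!devinfo)
                    return -ENOMEM;
            devinfo->offset = 0;
            devinfo->length = NFS4_MAX_UINT64;
            args->devinfo = devinfo;
            args->num_dev = 1;
            return 0;
    }
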
index 1e6308f82fc3d5887de850e72226233e4f2138d2..3e6ab7bfbabd428425227b6f9d2a94711edd371e 100644 (file)
@@ -178,6 +178,8 @@ struct pnfs_layoutdriver_type {
        void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
                                     struct xdr_stream *xdr,
                                     const struct nfs4_layoutcommit_args *args);
+       int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
+       void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data);
 };
 
 struct pnfs_layout_hdr {
@@ -290,7 +292,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 void pnfs_error_mark_layout_for_return(struct inode *inode,
                                       struct pnfs_layout_segment *lseg);
-
 /* nfs4_deviceid_flags */
 enum {
        NFS_DEVICEID_INVALID = 0,       /* set when MDS clientid recalled */
@@ -689,4 +690,14 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
 
 #endif /* CONFIG_NFS_V4_1 */
 
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int pnfs_report_layoutstat(struct inode *inode);
+#else
+static inline int
+pnfs_report_layoutstat(struct inode *inode)
+{
+       return 0;
+}
+#endif
+
 #endif /* FS_NFS_PNFS_H */
index e6c262555e08a62aff65ef3baa04e9666e9f18c2..65869ca9c851dbf4f0b289ca84865a018c2b6e57 100644 (file)
@@ -1290,6 +1290,7 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
 static void nfs_redirty_request(struct nfs_page *req)
 {
        nfs_mark_request_dirty(req);
+       set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
        nfs_unlock_request(req);
        nfs_end_page_writeback(req);
        nfs_release_request(req);
@@ -1348,11 +1349,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
        NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
-{
-       /* do nothing! */
-}
-
 /*
  * Special version of should_remove_suid() that ignores capabilities.
  */
@@ -1556,7 +1552,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
        /* Set up the initial task struct.  */
        nfs_ops->commit_setup(data, &msg);
 
-       dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+       dprintk("NFS: initiated commit call\n");
 
        nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client,
                NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg);
@@ -2013,7 +2009,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = {
        .rw_mode                = FMODE_WRITE,
        .rw_alloc_header        = nfs_writehdr_alloc,
        .rw_free_header         = nfs_writehdr_free,
-       .rw_release             = nfs_writeback_release_common,
        .rw_done                = nfs_writeback_done,
        .rw_result              = nfs_writeback_result,
        .rw_initiate            = nfs_initiate_write,
index 32201c269890433817f1fac83026d95fa760b996..b8e72aad919cfc72ea6710ac3786dc35bbe47201 100644 (file)
@@ -500,6 +500,7 @@ enum {
        NFSPROC4_CLNT_SEEK,
        NFSPROC4_CLNT_ALLOCATE,
        NFSPROC4_CLNT_DEALLOCATE,
+       NFSPROC4_CLNT_LAYOUTSTATS,
 };
 
 /* nfs41 types */
index b95f914ce083891325b6b8defa3f9995851cf76b..f91b5ade30c98fe8b03d06bc40deb0c434edb633 100644 (file)
@@ -219,6 +219,7 @@ struct nfs_inode {
 #define NFS_INO_COMMIT         (7)             /* inode is committing unstable writes */
 #define NFS_INO_LAYOUTCOMMIT   (9)             /* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)          /* layoutcommit inflight */
+#define NFS_INO_LAYOUTSTATS    (11)            /* layoutstats inflight */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
index 5e1273d4de14064198489a7aaccb73a88f36e7b8..a2ea1491d3dfc487611445490fb10adf9972d777 100644 (file)
@@ -237,5 +237,6 @@ struct nfs_server {
 #define NFS_CAP_SEEK           (1U << 19)
 #define NFS_CAP_ALLOCATE       (1U << 20)
 #define NFS_CAP_DEALLOCATE     (1U << 21)
+#define NFS_CAP_LAYOUTSTATS    (1U << 22)
 
 #endif
index 3eb072dbce833dd268b0189b9ff98a2aa85cc7bc..f2f650f136ee6fe181dfa27b71d7b944fc1f9357 100644 (file)
@@ -67,7 +67,6 @@ struct nfs_rw_ops {
        const fmode_t rw_mode;
        struct nfs_pgio_header *(*rw_alloc_header)(void);
        void (*rw_free_header)(struct nfs_pgio_header *);
-       void (*rw_release)(struct nfs_pgio_header *);
        int  (*rw_done)(struct rpc_task *, struct nfs_pgio_header *,
                        struct inode *);
        void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *);
index 93ab6071bbe967b56ea44c56111157f8a9135e3f..7bbe50504211d65cc096baa8bc6d45e2e9449125 100644 (file)
@@ -316,6 +316,49 @@ struct nfs4_layoutreturn {
        int rpc_status;
 };
 
+#define PNFS_LAYOUTSTATS_MAXSIZE 256
+
+struct nfs42_layoutstat_args;
+struct nfs42_layoutstat_devinfo;
+typedef        void (*layoutstats_encode_t)(struct xdr_stream *,
+               struct nfs42_layoutstat_args *,
+               struct nfs42_layoutstat_devinfo *);
+
+/* Per file per deviceid layoutstats */
+struct nfs42_layoutstat_devinfo {
+       struct nfs4_deviceid dev_id;
+       __u64 offset;
+       __u64 length;
+       __u64 read_count;
+       __u64 read_bytes;
+       __u64 write_count;
+       __u64 write_bytes;
+       __u32 layout_type;
+       layoutstats_encode_t layoutstats_encode;
+       void *layout_private;
+};
+
+struct nfs42_layoutstat_args {
+       struct nfs4_sequence_args seq_args;
+       struct nfs_fh *fh;
+       struct inode *inode;
+       nfs4_stateid stateid;
+       int num_dev;
+       struct nfs42_layoutstat_devinfo *devinfo;
+};
+
+struct nfs42_layoutstat_res {
+       struct nfs4_sequence_res seq_res;
+       int num_dev;
+       int rpc_status;
+};
+
+struct nfs42_layoutstat_data {
+       struct inode *inode;
+       struct nfs42_layoutstat_args args;
+       struct nfs42_layoutstat_res res;
+};
+
 struct stateowner_id {
        __u64   create_time;
        __u32   uniquifier;
@@ -984,17 +1027,14 @@ struct nfs4_readlink_res {
        struct nfs4_sequence_res        seq_res;
 };
 
-#define NFS4_SETCLIENTID_NAMELEN       (127)
 struct nfs4_setclientid {
        const nfs4_verifier *           sc_verifier;
-       unsigned int                    sc_name_len;
-       char                            sc_name[NFS4_SETCLIENTID_NAMELEN + 1];
        u32                             sc_prog;
        unsigned int                    sc_netid_len;
        char                            sc_netid[RPCBIND_MAXNETIDLEN + 1];
        unsigned int                    sc_uaddr_len;
        char                            sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
-       u32                             sc_cb_ident;
+       struct nfs_client               *sc_clnt;
        struct rpc_cred                 *sc_cred;
 };
 
@@ -1142,12 +1182,9 @@ struct nfs41_state_protection {
        struct nfs4_op_map allow;
 };
 
-#define NFS4_EXCHANGE_ID_LEN   (48)
 struct nfs41_exchange_id_args {
        struct nfs_client               *client;
        nfs4_verifier                   *verifier;
-       unsigned int                    id_len;
-       char                            id[NFS4_EXCHANGE_ID_LEN];
        u32                             flags;
        struct nfs41_state_protection   state_protect;
 };
index 2ca67b55e0fe2f0abf7e432ee8402608b6d57201..8df43c9f11dc295889639364101f324150d769ff 100644 (file)
@@ -37,7 +37,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied);
 void xprt_free_bc_request(struct rpc_rqst *req);
 int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
 void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
-int bc_send(struct rpc_rqst *req);
 
 /*
  * Determine if a shared backchannel is in use
index 598ba80ec30c974f02477a216a21077213a25aa5..131032f15cc187e0c7cbd0df80a08f4c44220800 100644 (file)
@@ -56,6 +56,7 @@ struct rpc_clnt {
        struct rpc_rtt *        cl_rtt;         /* RTO estimator data */
        const struct rpc_timeout *cl_timeout;   /* Timeout strategy */
 
+       atomic_t                cl_swapper;     /* swapfile count */
        int                     cl_nodelen;     /* nodename length */
        char                    cl_nodename[UNX_MAXNODENAME+1];
        struct rpc_pipe_dir_head cl_pipedir_objects;
index 5f1e6bd4c316d143751d19aac15af77aa7c1ac21..d703f0ef37d8f87436310247c19ca37f60ea692b 100644 (file)
@@ -205,8 +205,7 @@ struct rpc_wait_queue {
  */
 struct rpc_task *rpc_new_task(const struct rpc_task_setup *);
 struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
-                               const struct rpc_call_ops *ops);
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req);
 void           rpc_put_task(struct rpc_task *);
 void           rpc_put_task_async(struct rpc_task *);
 void           rpc_exit_task(struct rpc_task *);
@@ -269,4 +268,20 @@ static inline void rpc_assign_waitqueue_name(struct rpc_wait_queue *q,
 }
 #endif
 
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+int rpc_clnt_swap_activate(struct rpc_clnt *clnt);
+void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt);
+#else
+static inline int
+rpc_clnt_swap_activate(struct rpc_clnt *clnt)
+{
+       return -EINVAL;
+}
+
+static inline void
+rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
+{
+}
+#endif /* CONFIG_SUNRPC_SWAP */
+
 #endif /* _LINUX_SUNRPC_SCHED_H_ */
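
The new activate/deactivate pair is refcounted through clnt->cl_swapper (see the clnt.c hunk further down), so a caller only has to keep the two calls balanced. A sketch of the intended usage from a filesystem's swapfile hooks; the example_* names are illustrative, not from this series:

    /* Illustrative pairing: pin the transport for swap I/O while a
     * swapfile is active on this RPC client. */
    static int example_swapfile_activate(struct rpc_clnt *clnt)
    {
            /* first activation calls ->enable_swap() on the transport */
            return rpc_clnt_swap_activate(clnt);
    }

    static void example_swapfile_deactivate(struct rpc_clnt *clnt)
    {
            /* last deactivation calls ->disable_swap() */
            rpc_clnt_swap_deactivate(clnt);
    }
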
index 8b93ef53df3c95df08625bef6ba58341d583e1e0..0fb9acbb478095b576445d74d264ad5e5193a55e 100644 (file)
@@ -133,6 +133,9 @@ struct rpc_xprt_ops {
        void            (*close)(struct rpc_xprt *xprt);
        void            (*destroy)(struct rpc_xprt *xprt);
        void            (*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq);
+       int             (*enable_swap)(struct rpc_xprt *xprt);
+       void            (*disable_swap)(struct rpc_xprt *xprt);
+       void            (*inject_disconnect)(struct rpc_xprt *xprt);
 };
 
 /*
@@ -180,7 +183,7 @@ struct rpc_xprt {
        atomic_t                num_reqs;       /* total slots */
        unsigned long           state;          /* transport state */
        unsigned char           resvport   : 1; /* use a reserved port */
-       unsigned int            swapper;        /* we're swapping over this
+       atomic_t                swapper;        /* we're swapping over this
                                                   transport */
        unsigned int            bind_index;     /* bind function index */
 
@@ -212,7 +215,8 @@ struct rpc_xprt {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
        struct svc_serv         *bc_serv;       /* The RPC service which will */
                                                /* process the callback */
-       unsigned int            bc_alloc_count; /* Total number of preallocs */
+       int                     bc_alloc_count; /* Total number of preallocs */
+       atomic_t                bc_free_slots;
        spinlock_t              bc_pa_lock;     /* Protects the preallocated
                                                 * items */
        struct list_head        bc_pa_list;     /* List of preallocated
@@ -241,6 +245,7 @@ struct rpc_xprt {
        const char              *address_strings[RPC_DISPLAY_MAX];
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
        struct dentry           *debugfs;               /* debugfs directory */
+       atomic_t                inject_disconnect;
 #endif
 };
 
@@ -327,6 +332,18 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
        return p + xprt->tsh_size;
 }
 
+static inline int
+xprt_enable_swap(struct rpc_xprt *xprt)
+{
+       return xprt->ops->enable_swap(xprt);
+}
+
+static inline void
+xprt_disable_swap(struct rpc_xprt *xprt)
+{
+       xprt->ops->disable_swap(xprt);
+}
+
 /*
  * Transport switch helper functions
  */
@@ -345,7 +362,6 @@ void                        xprt_release_rqst_cong(struct rpc_task *task);
 void                   xprt_disconnect_done(struct rpc_xprt *xprt);
 void                   xprt_force_disconnect(struct rpc_xprt *xprt);
 void                   xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
-int                    xs_swapper(struct rpc_xprt *xprt, int enable);
 
 bool                   xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *);
 void                   xprt_unlock_connect(struct rpc_xprt *, void *);
@@ -431,6 +447,23 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt)
        return test_and_set_bit(XPRT_BINDING, &xprt->state);
 }
 
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+extern unsigned int rpc_inject_disconnect;
+static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+       if (!rpc_inject_disconnect)
+               return;
+       if (atomic_dec_return(&xprt->inject_disconnect))
+               return;
+       atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
+       xprt->ops->inject_disconnect(xprt);
+}
+#else
+static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+}
+#endif
+
 #endif /* __KERNEL__*/
 
 #endif /* _LINUX_SUNRPC_XPRT_H */
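
xprt_inject_disconnect() counts calls down from rpc_inject_disconnect and, on reaching zero, reloads the counter and invokes the transport's new ->inject_disconnect op. A trivial implementation sketch for a transport that opts in (hypothetical, built on the xprt_force_disconnect() helper declared above):

    /* Hypothetical transport op: simulate a lost connection by forcing
     * a disconnect of this transport. */
    static void example_inject_disconnect(struct rpc_xprt *xprt)
    {
            xprt_force_disconnect(xprt);
    }
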
index c984c85981eae2881ebdac43ffdbfba9b55b70b4..b17613052cc3fd9d8827ede1944d3489bdd2285d 100644 (file)
@@ -56,7 +56,8 @@
 
 #define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
 
-/* memory registration strategies */
+/* Memory registration strategies, by number.
+ * This is part of a kernel / user space API. Do not remove. */
 enum rpcrdma_memreg {
        RPCRDMA_BOUNCEBUFFERS = 0,
        RPCRDMA_REGISTER,
index 936ad0a15371ac4f1e49b8785ef645fa65583e83..b512fbd9d79a403ee980d40c1a21fc3fde47f215 100644 (file)
@@ -14,6 +14,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
            sunrpc_syms.o cache.o rpc_pipe.o \
            svc_xprt.o
 sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o
-sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o
+sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o
 sunrpc-$(CONFIG_PROC_FS) += stats.o
 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
index 9dd0ea8db463acc9daba0c51be89b1f17ec8f17d..9825ff0f91d6c0bde819105f639cae21883bbfad 100644 (file)
@@ -37,16 +37,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
 {
-       return xprt->bc_alloc_count > 0;
+       return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots);
 }
 
 static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
 {
+       atomic_add(n, &xprt->bc_free_slots);
        xprt->bc_alloc_count += n;
 }
 
 static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
 {
+       atomic_sub(n, &xprt->bc_free_slots);
        return xprt->bc_alloc_count -= n;
 }
 
@@ -60,13 +62,62 @@ static void xprt_free_allocation(struct rpc_rqst *req)
 
        dprintk("RPC:        free allocations for req= %p\n", req);
        WARN_ON_ONCE(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
-       xbufp = &req->rq_private_buf;
+       xbufp = &req->rq_rcv_buf;
        free_page((unsigned long)xbufp->head[0].iov_base);
        xbufp = &req->rq_snd_buf;
        free_page((unsigned long)xbufp->head[0].iov_base);
        kfree(req);
 }
 
+static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
+{
+       struct page *page;
+       /* Preallocate one XDR receive buffer */
+       page = alloc_page(gfp_flags);
+       if (page == NULL)
+               return -ENOMEM;
+       buf->head[0].iov_base = page_address(page);
+       buf->head[0].iov_len = PAGE_SIZE;
+       buf->tail[0].iov_base = NULL;
+       buf->tail[0].iov_len = 0;
+       buf->page_len = 0;
+       buf->len = 0;
+       buf->buflen = PAGE_SIZE;
+       return 0;
+}
+
+static
+struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags)
+{
+       struct rpc_rqst *req;
+
+       /* Pre-allocate one backchannel rpc_rqst */
+       req = kzalloc(sizeof(*req), gfp_flags);
+       if (req == NULL)
+               return NULL;
+
+       req->rq_xprt = xprt;
+       INIT_LIST_HEAD(&req->rq_list);
+       INIT_LIST_HEAD(&req->rq_bc_list);
+
+       /* Preallocate one XDR receive buffer */
+       if (xprt_alloc_xdr_buf(&req->rq_rcv_buf, gfp_flags) < 0) {
+               printk(KERN_ERR "Failed to create bc receive xbuf\n");
+               goto out_free;
+       }
+       req->rq_rcv_buf.len = PAGE_SIZE;
+
+       /* Preallocate one XDR send buffer */
+       if (xprt_alloc_xdr_buf(&req->rq_snd_buf, gfp_flags) < 0) {
+               printk(KERN_ERR "Failed to create bc snd xbuf\n");
+               goto out_free;
+       }
+       return req;
+out_free:
+       xprt_free_allocation(req);
+       return NULL;
+}
+
 /*
  * Preallocate up to min_reqs structures and related buffers for use
  * by the backchannel.  This function can be called multiple times
@@ -87,9 +138,7 @@ static void xprt_free_allocation(struct rpc_rqst *req)
  */
 int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
 {
-       struct page *page_rcv = NULL, *page_snd = NULL;
-       struct xdr_buf *xbufp = NULL;
-       struct rpc_rqst *req, *tmp;
+       struct rpc_rqst *req;
        struct list_head tmp_list;
        int i;
 
@@ -106,7 +155,7 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
        INIT_LIST_HEAD(&tmp_list);
        for (i = 0; i < min_reqs; i++) {
                /* Pre-allocate one backchannel rpc_rqst */
-               req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
+               req = xprt_alloc_bc_req(xprt, GFP_KERNEL);
                if (req == NULL) {
                        printk(KERN_ERR "Failed to create bc rpc_rqst\n");
                        goto out_free;
@@ -115,41 +164,6 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
                /* Add the allocated buffer to the tmp list */
                dprintk("RPC:       adding req= %p\n", req);
                list_add(&req->rq_bc_pa_list, &tmp_list);
-
-               req->rq_xprt = xprt;
-               INIT_LIST_HEAD(&req->rq_list);
-               INIT_LIST_HEAD(&req->rq_bc_list);
-
-               /* Preallocate one XDR receive buffer */
-               page_rcv = alloc_page(GFP_KERNEL);
-               if (page_rcv == NULL) {
-                       printk(KERN_ERR "Failed to create bc receive xbuf\n");
-                       goto out_free;
-               }
-               xbufp = &req->rq_rcv_buf;
-               xbufp->head[0].iov_base = page_address(page_rcv);
-               xbufp->head[0].iov_len = PAGE_SIZE;
-               xbufp->tail[0].iov_base = NULL;
-               xbufp->tail[0].iov_len = 0;
-               xbufp->page_len = 0;
-               xbufp->len = PAGE_SIZE;
-               xbufp->buflen = PAGE_SIZE;
-
-               /* Preallocate one XDR send buffer */
-               page_snd = alloc_page(GFP_KERNEL);
-               if (page_snd == NULL) {
-                       printk(KERN_ERR "Failed to create bc snd xbuf\n");
-                       goto out_free;
-               }
-
-               xbufp = &req->rq_snd_buf;
-               xbufp->head[0].iov_base = page_address(page_snd);
-               xbufp->head[0].iov_len = 0;
-               xbufp->tail[0].iov_base = NULL;
-               xbufp->tail[0].iov_len = 0;
-               xbufp->page_len = 0;
-               xbufp->len = 0;
-               xbufp->buflen = PAGE_SIZE;
        }
 
        /*
@@ -167,7 +181,10 @@ out_free:
        /*
         * Memory allocation failed, free the temporary list
         */
-       list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) {
+       while (!list_empty(&tmp_list)) {
+               req = list_first_entry(&tmp_list,
+                               struct rpc_rqst,
+                               rq_bc_pa_list);
                list_del(&req->rq_bc_pa_list);
                xprt_free_allocation(req);
        }
@@ -217,9 +234,15 @@ static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
        struct rpc_rqst *req = NULL;
 
        dprintk("RPC:       allocate a backchannel request\n");
-       if (list_empty(&xprt->bc_pa_list))
+       if (atomic_read(&xprt->bc_free_slots) <= 0)
                goto not_found;
-
+       if (list_empty(&xprt->bc_pa_list)) {
+               req = xprt_alloc_bc_req(xprt, GFP_ATOMIC);
+               if (!req)
+                       goto not_found;
+               /* Note: this 'free' request adds it to xprt->bc_pa_list */
+               xprt_free_bc_request(req);
+       }
        req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
                                rq_bc_pa_list);
        req->rq_reply_bytes_recvd = 0;
@@ -245,11 +268,21 @@ void xprt_free_bc_request(struct rpc_rqst *req)
 
        req->rq_connect_cookie = xprt->connect_cookie - 1;
        smp_mb__before_atomic();
-       WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
        clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
        smp_mb__after_atomic();
 
-       if (!xprt_need_to_requeue(xprt)) {
+       /*
+        * Return it to the list of preallocations so that it
+        * may be reused by a new callback request.
+        */
+       spin_lock_bh(&xprt->bc_pa_lock);
+       if (xprt_need_to_requeue(xprt)) {
+               list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+               xprt->bc_alloc_count++;
+               req = NULL;
+       }
+       spin_unlock_bh(&xprt->bc_pa_lock);
+       if (req != NULL) {
                /*
                 * The last remaining session was destroyed while this
                 * entry was in use.  Free the entry and don't attempt
@@ -260,14 +293,6 @@ void xprt_free_bc_request(struct rpc_rqst *req)
                xprt_free_allocation(req);
                return;
        }
-
-       /*
-        * Return it to the list of preallocations so that it
-        * may be reused by a new callback request.
-        */
-       spin_lock_bh(&xprt->bc_pa_lock);
-       list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
-       spin_unlock_bh(&xprt->bc_pa_lock);
 }
 
 /*
@@ -311,6 +336,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 
        spin_lock(&xprt->bc_pa_lock);
        list_del(&req->rq_bc_pa_list);
+       xprt->bc_alloc_count--;
        spin_unlock(&xprt->bc_pa_lock);
 
        req->rq_private_buf.len = copied;
diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c
deleted file mode 100644 (file)
index 15c7a8a..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/******************************************************************************
-
-(c) 2007 Network Appliance, Inc.  All Rights Reserved.
-(c) 2009 NetApp.  All Rights Reserved.
-
-NetApp provides this source code under the GPL v2 License.
-The GPL v2 license is available at
-http://opensource.org/licenses/gpl-license.php.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-******************************************************************************/
-
-/*
- * The NFSv4.1 callback service helper routines.
- * They implement the transport level processing required to send the
- * reply over an existing open connection previously established by the client.
- */
-
-#include <linux/module.h>
-
-#include <linux/sunrpc/xprt.h>
-#include <linux/sunrpc/sched.h>
-#include <linux/sunrpc/bc_xprt.h>
-
-#define RPCDBG_FACILITY        RPCDBG_SVCDSP
-
-/* Empty callback ops */
-static const struct rpc_call_ops nfs41_callback_ops = {
-};
-
-
-/*
- * Send the callback reply
- */
-int bc_send(struct rpc_rqst *req)
-{
-       struct rpc_task *task;
-       int ret;
-
-       dprintk("RPC:       bc_send req= %p\n", req);
-       task = rpc_run_bc_task(req, &nfs41_callback_ops);
-       if (IS_ERR(task))
-               ret = PTR_ERR(task);
-       else {
-               WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
-               ret = task->tk_status;
-               rpc_put_task(task);
-       }
-       dprintk("RPC:       bc_send ret= %d\n", ret);
-       return ret;
-}
-
index e6ce1517367f884608640b2532080ab6566b9379..cbc6af923dd1cb0baabc95161989133150269d4f 100644 (file)
@@ -891,15 +891,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
                        task->tk_flags |= RPC_TASK_SOFT;
                if (clnt->cl_noretranstimeo)
                        task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
-               if (sk_memalloc_socks()) {
-                       struct rpc_xprt *xprt;
-
-                       rcu_read_lock();
-                       xprt = rcu_dereference(clnt->cl_xprt);
-                       if (xprt->swapper)
-                               task->tk_flags |= RPC_TASK_SWAPPER;
-                       rcu_read_unlock();
-               }
+               if (atomic_read(&clnt->cl_swapper))
+                       task->tk_flags |= RPC_TASK_SWAPPER;
                /* Add to the client's list of all tasks */
                spin_lock(&clnt->cl_lock);
                list_add_tail(&task->tk_task, &clnt->cl_tasks);
@@ -1031,15 +1024,14 @@ EXPORT_SYMBOL_GPL(rpc_call_async);
  * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
  * rpc_execute against it
  * @req: RPC request
- * @tk_ops: RPC call ops
  */
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
-                               const struct rpc_call_ops *tk_ops)
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
 {
        struct rpc_task *task;
        struct xdr_buf *xbufp = &req->rq_snd_buf;
        struct rpc_task_setup task_setup_data = {
-               .callback_ops = tk_ops,
+               .callback_ops = &rpc_default_ops,
+               .flags = RPC_TASK_SOFTCONN,
        };
 
        dprintk("RPC: rpc_run_bc_task req= %p\n", req);
@@ -1614,6 +1606,7 @@ call_allocate(struct rpc_task *task)
                                        req->rq_callsize + req->rq_rcvsize);
        if (req->rq_buffer != NULL)
                return;
+       xprt_inject_disconnect(xprt);
 
        dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
 
@@ -1951,33 +1944,36 @@ call_bc_transmit(struct rpc_task *task)
 {
        struct rpc_rqst *req = task->tk_rqstp;
 
-       if (!xprt_prepare_transmit(task)) {
-               /*
-                * Could not reserve the transport. Try again after the
-                * transport is released.
-                */
-               task->tk_status = 0;
-               task->tk_action = call_bc_transmit;
-               return;
-       }
+       if (!xprt_prepare_transmit(task))
+               goto out_retry;
 
-       task->tk_action = rpc_exit_task;
        if (task->tk_status < 0) {
                printk(KERN_NOTICE "RPC: Could not send backchannel reply "
                        "error: %d\n", task->tk_status);
-               return;
+               goto out_done;
        }
+       if (req->rq_connect_cookie != req->rq_xprt->connect_cookie)
+               req->rq_bytes_sent = 0;
 
        xprt_transmit(task);
+
+       if (task->tk_status == -EAGAIN)
+               goto out_nospace;
+
        xprt_end_transmit(task);
        dprint_status(task);
        switch (task->tk_status) {
        case 0:
                /* Success */
-               break;
        case -EHOSTDOWN:
        case -EHOSTUNREACH:
        case -ENETUNREACH:
+       case -ECONNRESET:
+       case -ECONNREFUSED:
+       case -EADDRINUSE:
+       case -ENOTCONN:
+       case -EPIPE:
+               break;
        case -ETIMEDOUT:
                /*
                 * Problem reaching the server.  Disconnect and let the
@@ -2002,6 +1998,13 @@ call_bc_transmit(struct rpc_task *task)
                break;
        }
        rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
+out_done:
+       task->tk_action = rpc_exit_task;
+       return;
+out_nospace:
+       req->rq_connect_cookie = req->rq_xprt->connect_cookie;
+out_retry:
+       task->tk_status = 0;
 }
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
@@ -2476,3 +2479,59 @@ void rpc_show_tasks(struct net *net)
        spin_unlock(&sn->rpc_client_lock);
 }
 #endif
+
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+int
+rpc_clnt_swap_activate(struct rpc_clnt *clnt)
+{
+       int ret = 0;
+       struct rpc_xprt *xprt;
+
+       if (atomic_inc_return(&clnt->cl_swapper) == 1) {
+retry:
+               rcu_read_lock();
+               xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+               rcu_read_unlock();
+               if (!xprt) {
+                       /*
+                        * If we didn't get a reference, then we likely are
+                        * racing with a migration event. Wait for a grace
+                        * period and try again.
+                        */
+                       synchronize_rcu();
+                       goto retry;
+               }
+
+               ret = xprt_enable_swap(xprt);
+               xprt_put(xprt);
+       }
+       return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate);
+
+void
+rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
+{
+       struct rpc_xprt *xprt;
+
+       if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) {
+retry:
+               rcu_read_lock();
+               xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+               rcu_read_unlock();
+               if (!xprt) {
+                       /*
+                        * If we didn't get a reference, then we likely are
+                        * racing with a migration event. Wait for a grace
+                        * period and try again.
+                        */
+                       synchronize_rcu();
+                       goto retry;
+               }
+
+               xprt_disable_swap(xprt);
+               xprt_put(xprt);
+       }
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate);
+#endif /* CONFIG_SUNRPC_SWAP */
index 82962f7e6e888f619ad79754f038732d5d5b6333..e7b4d93566df42dfa5ecf985152235e539ed9933 100644 (file)
 #include "netns.h"
 
 static struct dentry *topdir;
+static struct dentry *rpc_fault_dir;
 static struct dentry *rpc_clnt_dir;
 static struct dentry *rpc_xprt_dir;
 
+unsigned int rpc_inject_disconnect;
+
 struct rpc_clnt_iter {
        struct rpc_clnt *clnt;
        loff_t          pos;
@@ -257,6 +260,8 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
                debugfs_remove_recursive(xprt->debugfs);
                xprt->debugfs = NULL;
        }
+
+       atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
 }
 
 void
@@ -266,11 +271,79 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt)
        xprt->debugfs = NULL;
 }
 
+static int
+fault_open(struct inode *inode, struct file *filp)
+{
+       filp->private_data = kmalloc(128, GFP_KERNEL);
+       if (!filp->private_data)
+               return -ENOMEM;
+       return 0;
+}
+
+static int
+fault_release(struct inode *inode, struct file *filp)
+{
+       kfree(filp->private_data);
+       return 0;
+}
+
+static ssize_t
+fault_disconnect_read(struct file *filp, char __user *user_buf,
+                     size_t len, loff_t *offset)
+{
+       char *buffer = (char *)filp->private_data;
+       size_t size;
+
+       size = sprintf(buffer, "%u\n", rpc_inject_disconnect);
+       return simple_read_from_buffer(user_buf, len, offset, buffer, size);
+}
+
+static ssize_t
+fault_disconnect_write(struct file *filp, const char __user *user_buf,
+                      size_t len, loff_t *offset)
+{
+       char buffer[16];
+
+       if (len >= sizeof(buffer))
+               len = sizeof(buffer) - 1;
+       if (copy_from_user(buffer, user_buf, len))
+               return -EFAULT;
+       buffer[len] = '\0';
+       if (kstrtouint(buffer, 10, &rpc_inject_disconnect))
+               return -EINVAL;
+       return len;
+}
+
+static const struct file_operations fault_disconnect_fops = {
+       .owner          = THIS_MODULE,
+       .open           = fault_open,
+       .read           = fault_disconnect_read,
+       .write          = fault_disconnect_write,
+       .release        = fault_release,
+};
+
+static struct dentry *
+inject_fault_dir(struct dentry *topdir)
+{
+       struct dentry *faultdir;
+
+       faultdir = debugfs_create_dir("inject_fault", topdir);
+       if (!faultdir)
+               return NULL;
+
+       if (!debugfs_create_file("disconnect", S_IFREG | S_IRUSR, faultdir,
+                                NULL, &fault_disconnect_fops))
+               return NULL;
+
+       return faultdir;
+}
+
 void __exit
 sunrpc_debugfs_exit(void)
 {
        debugfs_remove_recursive(topdir);
        topdir = NULL;
+       rpc_fault_dir = NULL;
        rpc_clnt_dir = NULL;
        rpc_xprt_dir = NULL;
 }
@@ -282,6 +355,10 @@ sunrpc_debugfs_init(void)
        if (!topdir)
                return;
 
+       rpc_fault_dir = inject_fault_dir(topdir);
+       if (!rpc_fault_dir)
+               goto out_remove;
+
        rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
        if (!rpc_clnt_dir)
                goto out_remove;
@@ -294,5 +371,6 @@ sunrpc_debugfs_init(void)
 out_remove:
        debugfs_remove_recursive(topdir);
        topdir = NULL;
+       rpc_fault_dir = NULL;
        rpc_clnt_dir = NULL;
 }
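
A sketch of the consumer side of this knob. The helper that reads xprt->inject_disconnect lives in the sunrpc headers rather than in this hunk, so its exact shape is an assumption here; in particular the re-arm behavior below is inferred from the atomic_set() in rpc_xprt_debugfs_register() above:

#include <linux/sunrpc/xprt.h>

/* Assumed shape: decrement the per-transport counter seeded from
 * rpc_inject_disconnect, and when it reaches zero, re-arm it and
 * force a disconnect through the transport's method.
 */
static inline void example_xprt_inject_disconnect(struct rpc_xprt *xprt)
{
	if (!rpc_inject_disconnect)
		return;			/* fault injection disabled */
	if (atomic_dec_return(&xprt->inject_disconnect) != 0)
		return;
	atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
	xprt->ops->inject_disconnect(xprt);
}

From userspace the knob would be driven by writing a period to the new file, e.g. a count of 100 into .../sunrpc/inject_fault/disconnect (assuming the usual debugfs mount under /sys/kernel/debug) forces a disconnect roughly once per hundred operations.
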
index 852ae606b02a37760a5a4dc1fd1860b8a45b89a6..5a16d8d8c831c4ad2805f5958b9ccef63449af82 100644 (file)
@@ -1350,6 +1350,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 {
        struct kvec     *argv = &rqstp->rq_arg.head[0];
        struct kvec     *resv = &rqstp->rq_res.head[0];
+       struct rpc_task *task;
+       int proc_error;
+       int error;
+
+       dprintk("svc: %s(%p)\n", __func__, req);
 
        /* Build the svc_rqst used by the common processing routine */
        rqstp->rq_xprt = serv->sv_bc_xprt;
@@ -1372,21 +1377,36 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 
        /*
         * Skip the next two words because they've already been
-        * processed in the trasport
+        * processed in the transport
         */
        svc_getu32(argv);       /* XID */
        svc_getnl(argv);        /* CALLDIR */
 
-       /* Returns 1 for send, 0 for drop */
-       if (svc_process_common(rqstp, argv, resv)) {
-               memcpy(&req->rq_snd_buf, &rqstp->rq_res,
-                                               sizeof(req->rq_snd_buf));
-               return bc_send(req);
-       } else {
-               /* drop request */
+       /* Parse and execute the bc call */
+       proc_error = svc_process_common(rqstp, argv, resv);
+
+       atomic_inc(&req->rq_xprt->bc_free_slots);
+       if (!proc_error) {
+               /* Processing error: drop the request */
                xprt_free_bc_request(req);
                return 0;
        }
+
+       /* Finally, send the reply synchronously */
+       memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
+       task = rpc_run_bc_task(req);
+       if (IS_ERR(task)) {
+               error = PTR_ERR(task);
+               goto out;
+       }
+
+       WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
+       error = task->tk_status;
+       rpc_put_task(task);
+
+out:
+       dprintk("svc: %s(), error=%d\n", __func__, error);
+       return error;
 }
 EXPORT_SYMBOL_GPL(bc_svc_process);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
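
A hedged sketch of the calling side: the NFSv4.1 callback service thread drains queued backchannel requests and hands each to bc_svc_process(), which after this change also runs the reply send synchronously. Locking and thread wakeups are omitted, and only bc_svc_process() itself is quoted from this patch:

#include <linux/sunrpc/bc_xprt.h>
#include <linux/sunrpc/svc.h>

/* Illustrative caller only: dequeue each queued backchannel rpc_rqst
 * and process it; bc_svc_process() now returns the send status.
 */
static void example_bc_service_loop(struct svc_serv *serv,
				    struct svc_rqst *rqstp)
{
	struct rpc_rqst *req;

	while (!list_empty(&serv->sv_cb_list)) {
		req = list_first_entry(&serv->sv_cb_list,
				       struct rpc_rqst, rq_bc_list);
		list_del(&req->rq_bc_list);
		bc_svc_process(serv, req, rqstp);
	}
}
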
index 1d4fe24af06a1115bd80538c5346ae2f843f1eb8..ab5dd621ae0c0795a0d86e1a9fb83c5cc2812486 100644 (file)
@@ -68,6 +68,7 @@ static void    xprt_init(struct rpc_xprt *xprt, struct net *net);
 static void    xprt_request_init(struct rpc_task *, struct rpc_xprt *);
 static void    xprt_connect_status(struct rpc_task *task);
 static int      __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
+static void     __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
 static void     xprt_destroy(struct rpc_xprt *xprt);
 
 static DEFINE_SPINLOCK(xprt_list_lock);
@@ -250,6 +251,8 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
        }
        xprt_clear_locked(xprt);
 out_sleep:
+       if (req)
+               __xprt_put_cong(xprt, req);
        dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
        task->tk_timeout = 0;
        task->tk_status = -EAGAIN;
@@ -608,8 +611,8 @@ static void xprt_autoclose(struct work_struct *work)
        struct rpc_xprt *xprt =
                container_of(work, struct rpc_xprt, task_cleanup);
 
-       xprt->ops->close(xprt);
        clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+       xprt->ops->close(xprt);
        xprt_release_write(xprt, NULL);
 }
 
@@ -967,6 +970,7 @@ void xprt_transmit(struct rpc_task *task)
                task->tk_status = status;
                return;
        }
+       xprt_inject_disconnect(xprt);
 
        dprintk("RPC: %5u xmit complete\n", task->tk_pid);
        task->tk_flags |= RPC_TASK_SENT;
@@ -1285,6 +1289,7 @@ void xprt_release(struct rpc_task *task)
        spin_unlock_bh(&xprt->transport_lock);
        if (req->rq_buffer)
                xprt->ops->buf_free(req->rq_buffer);
+       xprt_inject_disconnect(xprt);
        if (req->rq_cred != NULL)
                put_rpccred(req->rq_cred);
        task->tk_rqstp = NULL;
index 302d4ebf6fbfb2a2c15780ef966b52bb473b91e4..f1e8dafbd5079b3406a769ba4854ecba229edca6 100644 (file)
  * can take tens of usecs to complete.
  */
 
+/* Normal operation
+ *
+ * A Memory Region is prepared for RDMA READ or WRITE using the
+ * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
+ * finished, the Memory Region is unmapped using the ib_unmap_fmr
+ * verb (fmr_op_unmap).
+ */
+
+/* Transport recovery
+ *
+ * After a transport reconnect, fmr_op_map re-uses the MR already
+ * allocated for the RPC, but generates a fresh rkey then maps the
+ * MR again. This process is synchronous.
+ */
+
 #include "xprt_rdma.h"
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -50,19 +65,28 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
        struct rpcrdma_mw *r;
        int i, rc;
 
+       spin_lock_init(&buf->rb_mwlock);
        INIT_LIST_HEAD(&buf->rb_mws);
        INIT_LIST_HEAD(&buf->rb_all);
 
-       i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-       dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
+       i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
+       i += 2;                         /* head + tail */
+       i *= buf->rb_max_requests;      /* one set for each RPC slot */
+       dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
 
+       rc = -ENOMEM;
        while (i--) {
                r = kzalloc(sizeof(*r), GFP_KERNEL);
                if (!r)
-                       return -ENOMEM;
+                       goto out;
 
-               r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
-               if (IS_ERR(r->r.fmr))
+               r->r.fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
+                                            sizeof(u64), GFP_KERNEL);
+               if (!r->r.fmr.physaddrs)
+                       goto out_free;
+
+               r->r.fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
+               if (IS_ERR(r->r.fmr.fmr))
                        goto out_fmr_err;
 
                list_add(&r->mw_list, &buf->rb_mws);
@@ -71,12 +95,24 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
        return 0;
 
 out_fmr_err:
-       rc = PTR_ERR(r->r.fmr);
+       rc = PTR_ERR(r->r.fmr.fmr);
        dprintk("RPC:       %s: ib_alloc_fmr status %i\n", __func__, rc);
+       kfree(r->r.fmr.physaddrs);
+out_free:
        kfree(r);
+out:
        return rc;
 }
 
+static int
+__fmr_unmap(struct rpcrdma_mw *r)
+{
+       LIST_HEAD(l);
+
+       list_add(&r->r.fmr.fmr->list, &l);
+       return ib_unmap_fmr(&l);
+}
+
 /* Use the ib_map_phys_fmr() verb to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
@@ -85,12 +121,24 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
           int nsegs, bool writing)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct ib_device *device = ia->ri_id->device;
+       struct ib_device *device = ia->ri_device;
        enum dma_data_direction direction = rpcrdma_data_dir(writing);
        struct rpcrdma_mr_seg *seg1 = seg;
-       struct rpcrdma_mw *mw = seg1->rl_mw;
-       u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
        int len, pageoff, i, rc;
+       struct rpcrdma_mw *mw;
+
+       mw = seg1->rl_mw;
+       seg1->rl_mw = NULL;
+       if (!mw) {
+               mw = rpcrdma_get_mw(r_xprt);
+               if (!mw)
+                       return -ENOMEM;
+       } else {
+               /* this is a retransmit; generate a fresh rkey */
+               rc = __fmr_unmap(mw);
+               if (rc)
+                       return rc;
+       }
 
        pageoff = offset_in_page(seg1->mr_offset);
        seg1->mr_offset -= pageoff;     /* start of page */
@@ -100,7 +148,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
                nsegs = RPCRDMA_MAX_FMR_SGES;
        for (i = 0; i < nsegs;) {
                rpcrdma_map_one(device, seg, direction);
-               physaddrs[i] = seg->mr_dma;
+               mw->r.fmr.physaddrs[i] = seg->mr_dma;
                len += seg->mr_len;
                ++seg;
                ++i;
@@ -110,11 +158,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
                        break;
        }
 
-       rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
+       rc = ib_map_phys_fmr(mw->r.fmr.fmr, mw->r.fmr.physaddrs,
+                            i, seg1->mr_dma);
        if (rc)
                goto out_maperr;
 
-       seg1->mr_rkey = mw->r.fmr->rkey;
+       seg1->rl_mw = mw;
+       seg1->mr_rkey = mw->r.fmr.fmr->rkey;
        seg1->mr_base = seg1->mr_dma + pageoff;
        seg1->mr_nsegs = i;
        seg1->mr_len = len;
@@ -137,48 +187,28 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        struct rpcrdma_mr_seg *seg1 = seg;
-       struct ib_device *device;
+       struct rpcrdma_mw *mw = seg1->rl_mw;
        int rc, nsegs = seg->mr_nsegs;
-       LIST_HEAD(l);
 
-       list_add(&seg1->rl_mw->r.fmr->list, &l);
-       rc = ib_unmap_fmr(&l);
-       read_lock(&ia->ri_qplock);
-       device = ia->ri_id->device;
+       dprintk("RPC:       %s: FMR %p\n", __func__, mw);
+
+       seg1->rl_mw = NULL;
        while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(device, seg++);
-       read_unlock(&ia->ri_qplock);
+               rpcrdma_unmap_one(ia->ri_device, seg++);
+       rc = __fmr_unmap(mw);
        if (rc)
                goto out_err;
+       rpcrdma_put_mw(r_xprt, mw);
        return nsegs;
 
 out_err:
+       /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
+        * will attempt to release it when the transport is destroyed.
+        */
        dprintk("RPC:       %s: ib_unmap_fmr status %i\n", __func__, rc);
        return nsegs;
 }
 
-/* After a disconnect, unmap all FMRs.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_fmr_external().
- */
-static void
-fmr_op_reset(struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct rpcrdma_mw *r;
-       LIST_HEAD(list);
-       int rc;
-
-       list_for_each_entry(r, &buf->rb_all, mw_all)
-               list_add(&r->r.fmr->list, &list);
-
-       rc = ib_unmap_fmr(&list);
-       if (rc)
-               dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
-                       __func__, rc);
-}
-
 static void
 fmr_op_destroy(struct rpcrdma_buffer *buf)
 {
@@ -188,10 +218,13 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
        while (!list_empty(&buf->rb_all)) {
                r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
                list_del(&r->mw_all);
-               rc = ib_dealloc_fmr(r->r.fmr);
+               kfree(r->r.fmr.physaddrs);
+
+               rc = ib_dealloc_fmr(r->r.fmr.fmr);
                if (rc)
                        dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
                                __func__, rc);
+
                kfree(r);
        }
 }
@@ -202,7 +235,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
        .ro_open                        = fmr_op_open,
        .ro_maxpages                    = fmr_op_maxpages,
        .ro_init                        = fmr_op_init,
-       .ro_reset                       = fmr_op_reset,
        .ro_destroy                     = fmr_op_destroy,
        .ro_displayname                 = "fmr",
 };
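
For orientation, a sketch of how these strategy tables are consumed, assuming ro_map keeps fmr_op_map()'s signature; the wrapper name is illustrative:

/* Callers never name fmr/frwr/physical directly; they dispatch
 * through the ops table selected when the IA was opened.
 */
static int example_register_chunk(struct rpcrdma_xprt *r_xprt,
				  struct rpcrdma_mr_seg *seg,
				  int nsegs, bool writing)
{
	return r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, writing);
}
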
index d234521320a4bb45a9bbecd40e5e58bbb42d77d8..04ea914201b237cc6f42ce68caa6b5dbc7b29d59 100644 (file)
  * but most complex memory registration mode.
  */
 
+/* Normal operation
+ *
+ * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG
+ * Work Request (frmr_op_map). When the RDMA operation is finished, this
+ * Memory Region is invalidated using a LOCAL_INV Work Request
+ * (frmr_op_unmap).
+ *
+ * Typically these Work Requests are not signaled, and neither are RDMA
+ * SEND Work Requests (with the exception of signaling occasionally to
+ * prevent provider work queue overflows). This greatly reduces HCA
+ * interrupt workload.
+ *
+ * As an optimization, frwr_op_unmap marks MRs INVALID before the
+ * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
+ * rb_mws immediately so that no work (like managing a linked list
+ * under a spinlock) is needed in the completion upcall.
+ *
+ * But this means that frwr_op_map() can occasionally encounter an MR
+ * that is INVALID but the LOCAL_INV WR has not completed. Work Queue
+ * ordering prevents a subsequent FAST_REG WR from executing against
+ * that MR while it is still being invalidated.
+ */
+
+/* Transport recovery
+ *
+ * ->op_map and the transport connect worker cannot run at the same
+ * time, but ->op_unmap can fire while the transport connect worker
+ * is running. Thus MR recovery is handled in ->op_map, to guarantee
+ * that recovered MRs are owned by a sending RPC, and not by one
+ * whose ->op_unmap could fire while a transport reconnect is in
+ * progress.
+ *
+ * When the underlying transport disconnects, MRs are left in one of
+ * three states:
+ *
+ * INVALID:    The MR was not in use before the QP entered ERROR state.
+ *             (Or, the LOCAL_INV WR has not completed or flushed yet).
+ *
+ * STALE:      The MR was being registered or unregistered when the QP
+ *             entered ERROR state, and the pending WR was flushed.
+ *
+ * VALID:      The MR was registered before the QP entered ERROR state.
+ *
+ * When frwr_op_map encounters STALE and VALID MRs, they are recovered
+ * with ib_dereg_mr and then are re-initialized. Because MR recovery
+ * allocates fresh resources, it is deferred to a workqueue, and the
+ * recovered MRs are placed back on the rb_mws list when recovery is
+ * complete. frwr_op_map allocates another MR for the current RPC while
+ * the broken MR is reset.
+ *
+ * To ensure that frwr_op_map doesn't encounter an MR that is marked
+ * INVALID but that is about to be flushed due to a previous transport
+ * disconnect, the transport connect worker attempts to drain all
+ * pending send queue WRs before the transport is reconnected.
+ */
+
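
The three states above collapse to a single predicate at map time; a sketch, where the helper name is hypothetical and simply mirrors the do/while loop in frwr_op_map() below:

/* Only an INVALID MR may be reused directly; STALE and VALID MRs
 * must take the deferred-recovery path instead.
 */
static bool example_frwr_needs_recovery(const struct rpcrdma_mw *r)
{
	return r->r.frmr.fr_state != FRMR_IS_INVALID;
}
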
 #include "xprt_rdma.h"
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
+static struct workqueue_struct *frwr_recovery_wq;
+
+#define FRWR_RECOVERY_WQ_FLAGS         (WQ_UNBOUND | WQ_MEM_RECLAIM)
+
+int
+frwr_alloc_recovery_wq(void)
+{
+       frwr_recovery_wq = alloc_workqueue("frwr_recovery",
+                                          FRWR_RECOVERY_WQ_FLAGS, 0);
+       return !frwr_recovery_wq ? -ENOMEM : 0;
+}
+
+void
+frwr_destroy_recovery_wq(void)
+{
+       struct workqueue_struct *wq;
+
+       if (!frwr_recovery_wq)
+               return;
+
+       wq = frwr_recovery_wq;
+       frwr_recovery_wq = NULL;
+       destroy_workqueue(wq);
+}
+
+/* Deferred reset of a single FRMR. Generate a fresh rkey by
+ * replacing the MR.
+ *
+ * There's no recovery if this fails. The FRMR is abandoned, but
+ * remains in rb_all. It will be cleaned up when the transport is
+ * destroyed.
+ */
+static void
+__frwr_recovery_worker(struct work_struct *work)
+{
+       struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
+                                           r.frmr.fr_work);
+       struct rpcrdma_xprt *r_xprt = r->r.frmr.fr_xprt;
+       unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+       struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+
+       if (ib_dereg_mr(r->r.frmr.fr_mr))
+               goto out_fail;
+
+       r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+       if (IS_ERR(r->r.frmr.fr_mr))
+               goto out_fail;
+
+       dprintk("RPC:       %s: recovered FRMR %p\n", __func__, r);
+       r->r.frmr.fr_state = FRMR_IS_INVALID;
+       rpcrdma_put_mw(r_xprt, r);
+       return;
+
+out_fail:
+       pr_warn("RPC:       %s: FRMR %p unrecovered\n",
+               __func__, r);
+}
+
+/* A broken MR was discovered in a context that can't sleep.
+ * Defer recovery to the recovery worker.
+ */
+static void
+__frwr_queue_recovery(struct rpcrdma_mw *r)
+{
+       INIT_WORK(&r->r.frmr.fr_work, __frwr_recovery_worker);
+       queue_work(frwr_recovery_wq, &r->r.frmr.fr_work);
+}
+
 static int
 __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
            unsigned int depth)
@@ -128,7 +252,7 @@ frwr_sendcompletion(struct ib_wc *wc)
 
        /* WARNING: Only wr_id and status are reliable at this point */
        r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
-       dprintk("RPC:       %s: frmr %p (stale), status %s (%d)\n",
+       pr_warn("RPC:       %s: frmr %p flushed, status %s (%d)\n",
                __func__, r, ib_wc_status_msg(wc->status), wc->status);
        r->r.frmr.fr_state = FRMR_IS_STALE;
 }
@@ -137,16 +261,19 @@ static int
 frwr_op_init(struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+       struct ib_device *device = r_xprt->rx_ia.ri_device;
        unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
        struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
        int i;
 
+       spin_lock_init(&buf->rb_mwlock);
        INIT_LIST_HEAD(&buf->rb_mws);
        INIT_LIST_HEAD(&buf->rb_all);
 
-       i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-       dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
+       i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
+       i += 2;                         /* head + tail */
+       i *= buf->rb_max_requests;      /* one set for each RPC slot */
+       dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
 
        while (i--) {
                struct rpcrdma_mw *r;
@@ -165,6 +292,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
                list_add(&r->mw_list, &buf->rb_mws);
                list_add(&r->mw_all, &buf->rb_all);
                r->mw_sendcompletion = frwr_sendcompletion;
+               r->r.frmr.fr_xprt = r_xprt;
        }
 
        return 0;
@@ -178,12 +306,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
            int nsegs, bool writing)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct ib_device *device = ia->ri_id->device;
+       struct ib_device *device = ia->ri_device;
        enum dma_data_direction direction = rpcrdma_data_dir(writing);
        struct rpcrdma_mr_seg *seg1 = seg;
-       struct rpcrdma_mw *mw = seg1->rl_mw;
-       struct rpcrdma_frmr *frmr = &mw->r.frmr;
-       struct ib_mr *mr = frmr->fr_mr;
+       struct rpcrdma_mw *mw;
+       struct rpcrdma_frmr *frmr;
+       struct ib_mr *mr;
        struct ib_send_wr fastreg_wr, *bad_wr;
        u8 key;
        int len, pageoff;
@@ -192,12 +320,25 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        u64 pa;
        int page_no;
 
+       mw = seg1->rl_mw;
+       seg1->rl_mw = NULL;
+       do {
+               if (mw)
+                       __frwr_queue_recovery(mw);
+               mw = rpcrdma_get_mw(r_xprt);
+               if (!mw)
+                       return -ENOMEM;
+       } while (mw->r.frmr.fr_state != FRMR_IS_INVALID);
+       frmr = &mw->r.frmr;
+       frmr->fr_state = FRMR_IS_VALID;
+
        pageoff = offset_in_page(seg1->mr_offset);
        seg1->mr_offset -= pageoff;     /* start of page */
        seg1->mr_len += pageoff;
        len = -pageoff;
        if (nsegs > ia->ri_max_frmr_depth)
                nsegs = ia->ri_max_frmr_depth;
+
        for (page_no = i = 0; i < nsegs;) {
                rpcrdma_map_one(device, seg, direction);
                pa = seg->mr_dma;
@@ -216,8 +357,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        dprintk("RPC:       %s: Using frmr %p to map %d segments (%d bytes)\n",
                __func__, mw, i, len);
 
-       frmr->fr_state = FRMR_IS_VALID;
-
        memset(&fastreg_wr, 0, sizeof(fastreg_wr));
        fastreg_wr.wr_id = (unsigned long)(void *)mw;
        fastreg_wr.opcode = IB_WR_FAST_REG_MR;
@@ -229,6 +368,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        fastreg_wr.wr.fast_reg.access_flags = writing ?
                                IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
                                IB_ACCESS_REMOTE_READ;
+       mr = frmr->fr_mr;
        key = (u8)(mr->rkey & 0x000000FF);
        ib_update_fast_reg_key(mr, ++key);
        fastreg_wr.wr.fast_reg.rkey = mr->rkey;
@@ -238,6 +378,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        if (rc)
                goto out_senderr;
 
+       seg1->rl_mw = mw;
        seg1->mr_rkey = mr->rkey;
        seg1->mr_base = seg1->mr_dma + pageoff;
        seg1->mr_nsegs = i;
@@ -246,10 +387,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 
 out_senderr:
        dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
-       ib_update_fast_reg_key(mr, --key);
-       frmr->fr_state = FRMR_IS_INVALID;
        while (i--)
                rpcrdma_unmap_one(device, --seg);
+       __frwr_queue_recovery(mw);
        return rc;
 }
 
@@ -261,78 +401,46 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
        struct rpcrdma_mr_seg *seg1 = seg;
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct rpcrdma_mw *mw = seg1->rl_mw;
        struct ib_send_wr invalidate_wr, *bad_wr;
        int rc, nsegs = seg->mr_nsegs;
-       struct ib_device *device;
 
-       seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
+       dprintk("RPC:       %s: FRMR %p\n", __func__, mw);
+
+       seg1->rl_mw = NULL;
+       mw->r.frmr.fr_state = FRMR_IS_INVALID;
 
        memset(&invalidate_wr, 0, sizeof(invalidate_wr));
-       invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
+       invalidate_wr.wr_id = (unsigned long)(void *)mw;
        invalidate_wr.opcode = IB_WR_LOCAL_INV;
-       invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
+       invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);
 
-       read_lock(&ia->ri_qplock);
-       device = ia->ri_id->device;
        while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(device, seg++);
+               rpcrdma_unmap_one(ia->ri_device, seg++);
+       read_lock(&ia->ri_qplock);
        rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
        read_unlock(&ia->ri_qplock);
        if (rc)
                goto out_err;
+
+       rpcrdma_put_mw(r_xprt, mw);
        return nsegs;
 
 out_err:
-       /* Force rpcrdma_buffer_get() to retry */
-       seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
        dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
+       __frwr_queue_recovery(mw);
        return nsegs;
 }
 
-/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
- * an unusable state. Find FRMRs in this state and dereg / reg
- * each.  FRMRs that are VALID and attached to an rpcrdma_req are
- * also torn down.
- *
- * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_frmr_external().
- */
-static void
-frwr_op_reset(struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct ib_device *device = r_xprt->rx_ia.ri_id->device;
-       unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
-       struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-       struct rpcrdma_mw *r;
-       int rc;
-
-       list_for_each_entry(r, &buf->rb_all, mw_all) {
-               if (r->r.frmr.fr_state == FRMR_IS_INVALID)
-                       continue;
-
-               __frwr_release(r);
-               rc = __frwr_init(r, pd, device, depth);
-               if (rc) {
-                       dprintk("RPC:       %s: mw %p left %s\n",
-                               __func__, r,
-                               (r->r.frmr.fr_state == FRMR_IS_STALE ?
-                                       "stale" : "valid"));
-                       continue;
-               }
-
-               r->r.frmr.fr_state = FRMR_IS_INVALID;
-       }
-}
-
 static void
 frwr_op_destroy(struct rpcrdma_buffer *buf)
 {
        struct rpcrdma_mw *r;
 
+       /* Ensure stale MWs for "buf" are no longer in flight */
+       flush_workqueue(frwr_recovery_wq);
+
        while (!list_empty(&buf->rb_all)) {
                r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
                list_del(&r->mw_all);
@@ -347,7 +455,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
        .ro_open                        = frwr_op_open,
        .ro_maxpages                    = frwr_op_maxpages,
        .ro_init                        = frwr_op_init,
-       .ro_reset                       = frwr_op_reset,
        .ro_destroy                     = frwr_op_destroy,
        .ro_displayname                 = "frwr",
 };
index ba518af167873dfe2e9c1f5f6723665bda2bd2e7..41985d07fdb744b5d9523b7c34af93c30f70522d 100644 (file)
@@ -50,8 +50,7 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-       rpcrdma_map_one(ia->ri_id->device, seg,
-                       rpcrdma_data_dir(writing));
+       rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
        seg->mr_rkey = ia->ri_bind_mem->rkey;
        seg->mr_base = seg->mr_dma;
        seg->mr_nsegs = 1;
@@ -65,18 +64,10 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-       read_lock(&ia->ri_qplock);
-       rpcrdma_unmap_one(ia->ri_id->device, seg);
-       read_unlock(&ia->ri_qplock);
-
+       rpcrdma_unmap_one(ia->ri_device, seg);
        return 1;
 }
 
-static void
-physical_op_reset(struct rpcrdma_xprt *r_xprt)
-{
-}
-
 static void
 physical_op_destroy(struct rpcrdma_buffer *buf)
 {
@@ -88,7 +79,6 @@ const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
        .ro_open                        = physical_op_open,
        .ro_maxpages                    = physical_op_maxpages,
        .ro_init                        = physical_op_init,
-       .ro_reset                       = physical_op_reset,
        .ro_destroy                     = physical_op_destroy,
        .ro_displayname                 = "physical",
 };
index 2c53ea9e1b83dae01ebdd1aa22d256174dfbae08..84ea37daef36b0aa885c27e5eda950dda818949a 100644 (file)
@@ -284,9 +284,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
        return (unsigned char *)iptr - (unsigned char *)headerp;
 
 out:
-       if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
-               return n;
-
        for (pos = 0; nchunks--;)
                pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
                                                      &req->rl_segments[pos]);
@@ -732,8 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
        struct rpcrdma_msg *headerp;
        struct rpcrdma_req *req;
        struct rpc_rqst *rqst;
-       struct rpc_xprt *xprt = rep->rr_xprt;
-       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+       struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+       struct rpc_xprt *xprt = &r_xprt->rx_xprt;
        __be32 *iptr;
        int rdmalen, status;
        unsigned long cwnd;
@@ -770,7 +767,6 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
                        rep->rr_len);
 repost:
                r_xprt->rx_stats.bad_reply_count++;
-               rep->rr_func = rpcrdma_reply_handler;
                if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
                        rpcrdma_recv_buffer_put(rep);
 
index 436da2caec955ded2b2f4f6a2056cb1191594373..680f888a9ddd045314b305ef772385c7c6d5624e 100644 (file)
@@ -240,6 +240,16 @@ xprt_rdma_connect_worker(struct work_struct *work)
        xprt_clear_connecting(xprt);
 }
 
+static void
+xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
+{
+       struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
+                                                  rx_xprt);
+
+       pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt);
+       rdma_disconnect(r_xprt->rx_ia.ri_id);
+}
+
 /*
  * xprt_rdma_destroy
  *
@@ -612,12 +622,6 @@ xprt_rdma_send_request(struct rpc_task *task)
        if (req->rl_reply == NULL)              /* e.g. reconnection */
                rpcrdma_recv_buffer_get(req);
 
-       if (req->rl_reply) {
-               req->rl_reply->rr_func = rpcrdma_reply_handler;
-               /* this need only be done once, but... */
-               req->rl_reply->rr_xprt = xprt;
-       }
-
        /* Must suppress retransmit to maintain credits */
        if (req->rl_connect_cookie == xprt->connect_cookie)
                goto drop_connection;
@@ -676,6 +680,17 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
           r_xprt->rx_stats.bad_reply_count);
 }
 
+static int
+xprt_rdma_enable_swap(struct rpc_xprt *xprt)
+{
+       return -EINVAL;
+}
+
+static void
+xprt_rdma_disable_swap(struct rpc_xprt *xprt)
+{
+}
+
 /*
  * Plumbing for rpc transport switch and kernel module
  */
@@ -694,7 +709,10 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
        .send_request           = xprt_rdma_send_request,
        .close                  = xprt_rdma_close,
        .destroy                = xprt_rdma_destroy,
-       .print_stats            = xprt_rdma_print_stats
+       .print_stats            = xprt_rdma_print_stats,
+       .enable_swap            = xprt_rdma_enable_swap,
+       .disable_swap           = xprt_rdma_disable_swap,
+       .inject_disconnect      = xprt_rdma_inject_disconnect
 };
 
 static struct xprt_class xprt_rdma = {
@@ -720,17 +738,24 @@ void xprt_rdma_cleanup(void)
        if (rc)
                dprintk("RPC:       %s: xprt_unregister returned %i\n",
                        __func__, rc);
+
+       frwr_destroy_recovery_wq();
 }
 
 int xprt_rdma_init(void)
 {
        int rc;
 
-       rc = xprt_register_transport(&xprt_rdma);
-
+       rc = frwr_alloc_recovery_wq();
        if (rc)
                return rc;
 
+       rc = xprt_register_transport(&xprt_rdma);
+       if (rc) {
+               frwr_destroy_recovery_wq();
+               return rc;
+       }
+
        dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
 
        dprintk("Defaults:\n");
index 52df265b472a9b2b79574c7d9363acba26ea5d8b..891c4ede2c20ea8d8c6bc79ee080f353d4df13d7 100644 (file)
@@ -80,7 +80,6 @@ static void
 rpcrdma_run_tasklet(unsigned long data)
 {
        struct rpcrdma_rep *rep;
-       void (*func)(struct rpcrdma_rep *);
        unsigned long flags;
 
        data = data;
@@ -89,14 +88,9 @@ rpcrdma_run_tasklet(unsigned long data)
                rep = list_entry(rpcrdma_tasklets_g.next,
                                 struct rpcrdma_rep, rr_list);
                list_del(&rep->rr_list);
-               func = rep->rr_func;
-               rep->rr_func = NULL;
                spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
 
-               if (func)
-                       func(rep);
-               else
-                       rpcrdma_recv_buffer_put(rep);
+               rpcrdma_reply_handler(rep);
 
                spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
        }
@@ -236,7 +230,7 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
                __func__, rep, wc->byte_len);
 
        rep->rr_len = wc->byte_len;
-       ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+       ib_dma_sync_single_for_cpu(rep->rr_device,
                                   rdmab_addr(rep->rr_rdmabuf),
                                   rep->rr_len, DMA_FROM_DEVICE);
        prefetch(rdmab_to_msg(rep->rr_rdmabuf));
@@ -407,7 +401,7 @@ connected:
 
                pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
                        sap, rpc_get_port(sap),
-                       ia->ri_id->device->name,
+                       ia->ri_device->name,
                        ia->ri_ops->ro_displayname,
                        xprt->rx_buf.rb_max_requests,
                        ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
@@ -508,8 +502,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                rc = PTR_ERR(ia->ri_id);
                goto out1;
        }
+       ia->ri_device = ia->ri_id->device;
 
-       ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
+       ia->ri_pd = ib_alloc_pd(ia->ri_device);
        if (IS_ERR(ia->ri_pd)) {
                rc = PTR_ERR(ia->ri_pd);
                dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
@@ -517,7 +512,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                goto out2;
        }
 
-       rc = ib_query_device(ia->ri_id->device, devattr);
+       rc = ib_query_device(ia->ri_device, devattr);
        if (rc) {
                dprintk("RPC:       %s: ib_query_device failed %d\n",
                        __func__, rc);
@@ -526,7 +521,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 
        if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
                ia->ri_have_dma_lkey = 1;
-               ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+               ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
        }
 
        if (memreg == RPCRDMA_FRMR) {
@@ -541,7 +536,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                }
        }
        if (memreg == RPCRDMA_MTHCAFMR) {
-               if (!ia->ri_id->device->alloc_fmr) {
+               if (!ia->ri_device->alloc_fmr) {
                        dprintk("RPC:       %s: MTHCAFMR registration "
                                "not supported by HCA\n", __func__);
                        memreg = RPCRDMA_ALLPHYSICAL;
@@ -590,9 +585,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        dprintk("RPC:       %s: memory registration strategy is '%s'\n",
                __func__, ia->ri_ops->ro_displayname);
 
-       /* Else will do memory reg/dereg for each chunk */
-       ia->ri_memreg_strategy = memreg;
-
        rwlock_init(&ia->ri_qplock);
        return 0;
 
@@ -622,17 +614,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
                dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
                        __func__, rc);
        }
+
        if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
                if (ia->ri_id->qp)
                        rdma_destroy_qp(ia->ri_id);
                rdma_destroy_id(ia->ri_id);
                ia->ri_id = NULL;
        }
-       if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
-               rc = ib_dealloc_pd(ia->ri_pd);
-               dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
-                       __func__, rc);
-       }
+
+       /* If the pd is still busy, xprtrdma missed freeing a resource */
+       if (ia->ri_pd && !IS_ERR(ia->ri_pd))
+               WARN_ON(ib_dealloc_pd(ia->ri_pd));
 }
 
 /*
@@ -693,8 +685,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
 
        cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1;
-       sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
-                                 rpcrdma_cq_async_error_upcall, ep, &cq_attr);
+       sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
+                             rpcrdma_cq_async_error_upcall, ep, &cq_attr);
        if (IS_ERR(sendcq)) {
                rc = PTR_ERR(sendcq);
                dprintk("RPC:       %s: failed to create send CQ: %i\n",
@@ -710,8 +702,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        }
 
        cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1;
-       recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
-                                 rpcrdma_cq_async_error_upcall, ep, &cq_attr);
+       recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
+                             rpcrdma_cq_async_error_upcall, ep, &cq_attr);
        if (IS_ERR(recvcq)) {
                rc = PTR_ERR(recvcq);
                dprintk("RPC:       %s: failed to create recv CQ: %i\n",
@@ -817,8 +809,6 @@ retry:
                rpcrdma_flush_cqs(ep);
 
                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
-               ia->ri_ops->ro_reset(xprt);
-
                id = rpcrdma_create_id(xprt, ia,
                                (struct sockaddr *)&xprt->rx_data.addr);
                if (IS_ERR(id)) {
@@ -832,7 +822,7 @@ retry:
                 * More stuff I haven't thought of!
                 * Rrrgh!
                 */
-               if (ia->ri_id->device != id->device) {
+               if (ia->ri_device != id->device) {
                        printk("RPC:       %s: can't reconnect on "
                                "different device!\n", __func__);
                        rdma_destroy_id(id);
@@ -974,7 +964,8 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
                goto out_free;
        }
 
-       rep->rr_buffer = &r_xprt->rx_buf;
+       rep->rr_device = ia->ri_device;
+       rep->rr_rxprt = r_xprt;
        return rep;
 
 out_free:
@@ -1098,31 +1089,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
        kfree(buf->rb_pool);
 }
 
-/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
- * some req segments uninitialized.
- */
-static void
-rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
+struct rpcrdma_mw *
+rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
 {
-       if (*mw) {
-               list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
-               *mw = NULL;
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+       struct rpcrdma_mw *mw = NULL;
+
+       spin_lock(&buf->rb_mwlock);
+       if (!list_empty(&buf->rb_mws)) {
+               mw = list_first_entry(&buf->rb_mws,
+                                     struct rpcrdma_mw, mw_list);
+               list_del_init(&mw->mw_list);
        }
+       spin_unlock(&buf->rb_mwlock);
+
+       if (!mw)
+               pr_err("RPC:       %s: no MWs available\n", __func__);
+       return mw;
 }
 
-/* Cycle mw's back in reverse order, and "spin" them.
- * This delays and scrambles reuse as much as possible.
- */
-static void
-rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+void
+rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
 {
-       struct rpcrdma_mr_seg *seg = req->rl_segments;
-       struct rpcrdma_mr_seg *seg1 = seg;
-       int i;
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 
-       for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
-               rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
-       rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
+       spin_lock(&buf->rb_mwlock);
+       list_add_tail(&mw->mw_list, &buf->rb_mws);
+       spin_unlock(&buf->rb_mwlock);
 }
 
 static void
@@ -1132,115 +1125,10 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
        req->rl_niovs = 0;
        if (req->rl_reply) {
                buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
-               req->rl_reply->rr_func = NULL;
                req->rl_reply = NULL;
        }
 }
 
-/* rpcrdma_unmap_one() was already done during deregistration.
- * Redo only the ib_post_send().
- */
-static void
-rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
-{
-       struct rpcrdma_xprt *r_xprt =
-                               container_of(ia, struct rpcrdma_xprt, rx_ia);
-       struct ib_send_wr invalidate_wr, *bad_wr;
-       int rc;
-
-       dprintk("RPC:       %s: FRMR %p is stale\n", __func__, r);
-
-       /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
-       r->r.frmr.fr_state = FRMR_IS_INVALID;
-
-       memset(&invalidate_wr, 0, sizeof(invalidate_wr));
-       invalidate_wr.wr_id = (unsigned long)(void *)r;
-       invalidate_wr.opcode = IB_WR_LOCAL_INV;
-       invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
-       DECR_CQCOUNT(&r_xprt->rx_ep);
-
-       dprintk("RPC:       %s: frmr %p invalidating rkey %08x\n",
-               __func__, r, r->r.frmr.fr_mr->rkey);
-
-       read_lock(&ia->ri_qplock);
-       rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
-       read_unlock(&ia->ri_qplock);
-       if (rc) {
-               /* Force rpcrdma_buffer_get() to retry */
-               r->r.frmr.fr_state = FRMR_IS_STALE;
-               dprintk("RPC:       %s: ib_post_send failed, %i\n",
-                       __func__, rc);
-       }
-}
-
-static void
-rpcrdma_retry_flushed_linv(struct list_head *stale,
-                          struct rpcrdma_buffer *buf)
-{
-       struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-       struct list_head *pos;
-       struct rpcrdma_mw *r;
-       unsigned long flags;
-
-       list_for_each(pos, stale) {
-               r = list_entry(pos, struct rpcrdma_mw, mw_list);
-               rpcrdma_retry_local_inv(r, ia);
-       }
-
-       spin_lock_irqsave(&buf->rb_lock, flags);
-       list_splice_tail(stale, &buf->rb_mws);
-       spin_unlock_irqrestore(&buf->rb_lock, flags);
-}
-
-static struct rpcrdma_req *
-rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
-                        struct list_head *stale)
-{
-       struct rpcrdma_mw *r;
-       int i;
-
-       i = RPCRDMA_MAX_SEGS - 1;
-       while (!list_empty(&buf->rb_mws)) {
-               r = list_entry(buf->rb_mws.next,
-                              struct rpcrdma_mw, mw_list);
-               list_del(&r->mw_list);
-               if (r->r.frmr.fr_state == FRMR_IS_STALE) {
-                       list_add(&r->mw_list, stale);
-                       continue;
-               }
-               req->rl_segments[i].rl_mw = r;
-               if (unlikely(i-- == 0))
-                       return req;     /* Success */
-       }
-
-       /* Not enough entries on rb_mws for this req */
-       rpcrdma_buffer_put_sendbuf(req, buf);
-       rpcrdma_buffer_put_mrs(req, buf);
-       return NULL;
-}
-
-static struct rpcrdma_req *
-rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
-{
-       struct rpcrdma_mw *r;
-       int i;
-
-       i = RPCRDMA_MAX_SEGS - 1;
-       while (!list_empty(&buf->rb_mws)) {
-               r = list_entry(buf->rb_mws.next,
-                              struct rpcrdma_mw, mw_list);
-               list_del(&r->mw_list);
-               req->rl_segments[i].rl_mw = r;
-               if (unlikely(i-- == 0))
-                       return req;     /* Success */
-       }
-
-       /* Not enough entries on rb_mws for this req */
-       rpcrdma_buffer_put_sendbuf(req, buf);
-       rpcrdma_buffer_put_mrs(req, buf);
-       return NULL;
-}
-
 /*
  * Get a set of request/reply buffers.
  *
@@ -1253,12 +1141,11 @@ rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
 struct rpcrdma_req *
 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 {
-       struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
-       struct list_head stale;
        struct rpcrdma_req *req;
        unsigned long flags;
 
        spin_lock_irqsave(&buffers->rb_lock, flags);
+
        if (buffers->rb_send_index == buffers->rb_max_requests) {
                spin_unlock_irqrestore(&buffers->rb_lock, flags);
                dprintk("RPC:       %s: out of request buffers\n", __func__);
@@ -1277,20 +1164,7 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
        }
        buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
 
-       INIT_LIST_HEAD(&stale);
-       switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_FRMR:
-               req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
-               break;
-       case RPCRDMA_MTHCAFMR:
-               req = rpcrdma_buffer_get_fmrs(req, buffers);
-               break;
-       default:
-               break;
-       }
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
-       if (!list_empty(&stale))
-               rpcrdma_retry_flushed_linv(&stale, buffers);
        return req;
 }
 
@@ -1302,19 +1176,10 @@ void
 rpcrdma_buffer_put(struct rpcrdma_req *req)
 {
        struct rpcrdma_buffer *buffers = req->rl_buffer;
-       struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
        unsigned long flags;
 
        spin_lock_irqsave(&buffers->rb_lock, flags);
        rpcrdma_buffer_put_sendbuf(req, buffers);
-       switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_FRMR:
-       case RPCRDMA_MTHCAFMR:
-               rpcrdma_buffer_put_mrs(req, buffers);
-               break;
-       default:
-               break;
-       }
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
 }
 
@@ -1344,10 +1209,9 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
 void
 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
 {
-       struct rpcrdma_buffer *buffers = rep->rr_buffer;
+       struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
        unsigned long flags;
 
-       rep->rr_func = NULL;
        spin_lock_irqsave(&buffers->rb_lock, flags);
        buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
@@ -1376,9 +1240,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
        /*
         * All memory passed here was kmalloc'ed, therefore phys-contiguous.
         */
-       iov->addr = ib_dma_map_single(ia->ri_id->device,
+       iov->addr = ib_dma_map_single(ia->ri_device,
                        va, len, DMA_BIDIRECTIONAL);
-       if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
+       if (ib_dma_mapping_error(ia->ri_device, iov->addr))
                return -ENOMEM;
 
        iov->length = len;
@@ -1422,8 +1286,8 @@ rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
 {
        int rc;
 
-       ib_dma_unmap_single(ia->ri_id->device,
-                       iov->addr, iov->length, DMA_BIDIRECTIONAL);
+       ib_dma_unmap_single(ia->ri_device,
+                           iov->addr, iov->length, DMA_BIDIRECTIONAL);
 
        if (NULL == mr)
                return 0;
@@ -1516,15 +1380,18 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
        send_wr.num_sge = req->rl_niovs;
        send_wr.opcode = IB_WR_SEND;
        if (send_wr.num_sge == 4)       /* no need to sync any pad (constant) */
-               ib_dma_sync_single_for_device(ia->ri_id->device,
-                       req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
-                       DMA_TO_DEVICE);
-       ib_dma_sync_single_for_device(ia->ri_id->device,
-               req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
-               DMA_TO_DEVICE);
-       ib_dma_sync_single_for_device(ia->ri_id->device,
-               req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
-               DMA_TO_DEVICE);
+               ib_dma_sync_single_for_device(ia->ri_device,
+                                             req->rl_send_iov[3].addr,
+                                             req->rl_send_iov[3].length,
+                                             DMA_TO_DEVICE);
+       ib_dma_sync_single_for_device(ia->ri_device,
+                                     req->rl_send_iov[1].addr,
+                                     req->rl_send_iov[1].length,
+                                     DMA_TO_DEVICE);
+       ib_dma_sync_single_for_device(ia->ri_device,
+                                     req->rl_send_iov[0].addr,
+                                     req->rl_send_iov[0].length,
+                                     DMA_TO_DEVICE);
 
        if (DECR_CQCOUNT(ep) > 0)
                send_wr.send_flags = 0;
@@ -1557,7 +1424,7 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
        recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
        recv_wr.num_sge = 1;
 
-       ib_dma_sync_single_for_cpu(ia->ri_id->device,
+       ib_dma_sync_single_for_cpu(ia->ri_device,
                                   rdmab_addr(rep->rr_rdmabuf),
                                   rdmab_length(rep->rr_rdmabuf),
                                   DMA_BIDIRECTIONAL);
index 58163b88738c2363c559be28d3f3807043065826..f49dd8b381221dceaef4847e4ae28d397ebcdf27 100644 (file)
@@ -62,6 +62,7 @@
 struct rpcrdma_ia {
        const struct rpcrdma_memreg_ops *ri_ops;
        rwlock_t                ri_qplock;
+       struct ib_device        *ri_device;
        struct rdma_cm_id       *ri_id;
        struct ib_pd            *ri_pd;
        struct ib_mr            *ri_bind_mem;
@@ -69,7 +70,6 @@ struct rpcrdma_ia {
        int                     ri_have_dma_lkey;
        struct completion       ri_done;
        int                     ri_async_rc;
-       enum rpcrdma_memreg     ri_memreg_strategy;
        unsigned int            ri_max_frmr_depth;
        struct ib_device_attr   ri_devattr;
        struct ib_qp_attr       ri_qp_attr;
@@ -173,9 +173,8 @@ struct rpcrdma_buffer;
 
 struct rpcrdma_rep {
        unsigned int            rr_len;
-       struct rpcrdma_buffer   *rr_buffer;
-       struct rpc_xprt         *rr_xprt;
-       void                    (*rr_func)(struct rpcrdma_rep *);
+       struct ib_device        *rr_device;
+       struct rpcrdma_xprt     *rr_rxprt;
        struct list_head        rr_list;
        struct rpcrdma_regbuf   *rr_rdmabuf;
 };
@@ -203,11 +202,18 @@ struct rpcrdma_frmr {
        struct ib_fast_reg_page_list    *fr_pgl;
        struct ib_mr                    *fr_mr;
        enum rpcrdma_frmr_state         fr_state;
+       struct work_struct              fr_work;
+       struct rpcrdma_xprt             *fr_xprt;
+};
+
+struct rpcrdma_fmr {
+       struct ib_fmr           *fmr;
+       u64                     *physaddrs;
 };
 
 struct rpcrdma_mw {
        union {
-               struct ib_fmr           *fmr;
+               struct rpcrdma_fmr      fmr;
                struct rpcrdma_frmr     frmr;
        } r;
        void                    (*mw_sendcompletion)(struct ib_wc *);
@@ -281,15 +287,17 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
  * One of these is associated with a transport instance
  */
 struct rpcrdma_buffer {
-       spinlock_t      rb_lock;        /* protects indexes */
-       u32             rb_max_requests;/* client max requests */
-       struct list_head rb_mws;        /* optional memory windows/fmrs/frmrs */
-       struct list_head rb_all;
-       int             rb_send_index;
+       spinlock_t              rb_mwlock;      /* protect rb_mws list */
+       struct list_head        rb_mws;
+       struct list_head        rb_all;
+       char                    *rb_pool;
+
+       spinlock_t              rb_lock;        /* protect buf arrays */
+       u32                     rb_max_requests;
+       int                     rb_send_index;
+       int                     rb_recv_index;
        struct rpcrdma_req      **rb_send_bufs;
-       int             rb_recv_index;
        struct rpcrdma_rep      **rb_recv_bufs;
-       char            *rb_pool;
 };
 #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
 
@@ -350,7 +358,6 @@ struct rpcrdma_memreg_ops {
                                   struct rpcrdma_create_data_internal *);
        size_t          (*ro_maxpages)(struct rpcrdma_xprt *);
        int             (*ro_init)(struct rpcrdma_xprt *);
-       void            (*ro_reset)(struct rpcrdma_xprt *);
        void            (*ro_destroy)(struct rpcrdma_buffer *);
        const char      *ro_displayname;
 };
@@ -413,6 +420,8 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
 int rpcrdma_buffer_create(struct rpcrdma_xprt *);
 void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
 
+struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *);
+void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
 struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
 void rpcrdma_buffer_put(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
@@ -425,6 +434,9 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
 
 unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
 
+int frwr_alloc_recovery_wq(void);
+void frwr_destroy_recovery_wq(void);
+
 /*
  * Wrappers for chunk registration, shared by read/write chunk code.
  */
index b0517287075b2753101bbde70262c99c2a58f35b..e193c2b5476b3a83973e9799e2e826fdcd2b842c 100644 (file)
@@ -622,24 +622,6 @@ process_status:
        return status;
 }
 
-/**
- * xs_tcp_shutdown - gracefully shut down a TCP socket
- * @xprt: transport
- *
- * Initiates a graceful shutdown of the TCP socket by calling the
- * equivalent of shutdown(SHUT_RDWR);
- */
-static void xs_tcp_shutdown(struct rpc_xprt *xprt)
-{
-       struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
-       struct socket *sock = transport->sock;
-
-       if (sock != NULL) {
-               kernel_sock_shutdown(sock, SHUT_RDWR);
-               trace_rpc_socket_shutdown(xprt, sock);
-       }
-}
-
 /**
  * xs_tcp_send_request - write an RPC request to a TCP socket
  * @task: address of RPC task that manages the state of an RPC request
@@ -786,6 +768,7 @@ static void xs_sock_mark_closed(struct rpc_xprt *xprt)
        xs_sock_reset_connection_flags(xprt);
        /* Mark transport as closed and wake up all pending tasks */
        xprt_disconnect_done(xprt);
+       xprt_force_disconnect(xprt);
 }
 
 /**
@@ -827,6 +810,9 @@ static void xs_reset_transport(struct sock_xprt *transport)
        if (sk == NULL)
                return;
 
+       if (atomic_read(&transport->xprt.swapper))
+               sk_clear_memalloc(sk);
+
        write_lock_bh(&sk->sk_callback_lock);
        transport->inet = NULL;
        transport->sock = NULL;
@@ -863,6 +849,13 @@ static void xs_close(struct rpc_xprt *xprt)
        xprt_disconnect_done(xprt);
 }
 
+static void xs_inject_disconnect(struct rpc_xprt *xprt)
+{
+       dprintk("RPC:       injecting transport disconnect on xprt=%p\n",
+               xprt);
+       xprt_disconnect_done(xprt);
+}
+
 static void xs_xprt_free(struct rpc_xprt *xprt)
 {
        xs_free_peer_addresses(xprt);
@@ -901,7 +894,6 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
 /**
  * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets
  * @sk: socket with data to read
- * @len: how much data to read
  *
  * Currently this assumes we can read the whole reply in a single gulp.
  */
@@ -965,7 +957,6 @@ static void xs_local_data_ready(struct sock *sk)
 /**
  * xs_udp_data_ready - "data ready" callback for UDP sockets
  * @sk: socket with data to read
- * @len: how much data to read
  *
  */
 static void xs_udp_data_ready(struct sock *sk)
@@ -1389,7 +1380,6 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
 /**
  * xs_tcp_data_ready - "data ready" callback for TCP sockets
  * @sk: socket with data to read
- * @bytes: how much data to read
  *
  */
 static void xs_tcp_data_ready(struct sock *sk)
@@ -1886,9 +1876,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
 
 /**
  * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint
- * @xprt: RPC transport to connect
  * @transport: socket transport to connect
- * @create_sock: function to create a socket of the correct type
  */
 static int xs_local_setup_socket(struct sock_xprt *transport)
 {
@@ -1960,43 +1948,84 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
                msleep_interruptible(15000);
 }
 
-#ifdef CONFIG_SUNRPC_SWAP
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+/*
+ * Note that this should be called with XPRT_LOCKED held (or when we otherwise
+ * know that we have exclusive access to the socket), to guard against
+ * races with xs_reset_transport.
+ */
 static void xs_set_memalloc(struct rpc_xprt *xprt)
 {
        struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
                        xprt);
 
-       if (xprt->swapper)
+       /*
+        * If there's no sock, then we have nothing to set. The
+        * reconnecting process will get it for us.
+        */
+       if (!transport->inet)
+               return;
+       if (atomic_read(&xprt->swapper))
                sk_set_memalloc(transport->inet);
 }
 
 /**
- * xs_swapper - Tag this transport as being used for swap.
+ * xs_enable_swap - Tag this transport as being used for swap.
  * @xprt: transport to tag
- * @enable: enable/disable
  *
+ * Take a reference to this transport on behalf of the rpc_clnt, and
+ * mark it for swapping if it wasn't already.
  */
-int xs_swapper(struct rpc_xprt *xprt, int enable)
+static int
+xs_enable_swap(struct rpc_xprt *xprt)
 {
-       struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
-                       xprt);
-       int err = 0;
+       struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
 
-       if (enable) {
-               xprt->swapper++;
-               xs_set_memalloc(xprt);
-       } else if (xprt->swapper) {
-               xprt->swapper--;
-               sk_clear_memalloc(transport->inet);
-       }
+       if (atomic_inc_return(&xprt->swapper) != 1)
+               return 0;
+       if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
+               return -ERESTARTSYS;
+       if (xs->inet)
+               sk_set_memalloc(xs->inet);
+       xprt_release_xprt(xprt, NULL);
+       return 0;
+}
 
-       return err;
+/**
+ * xs_disable_swap - Untag this transport as being used for swap.
+ * @xprt: transport to untag
+ *
+ * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the
+ * swapper refcount goes to 0, untag the socket as a memalloc socket.
+ */
+static void
+xs_disable_swap(struct rpc_xprt *xprt)
+{
+       struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
+
+       if (!atomic_dec_and_test(&xprt->swapper))
+               return;
+       if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
+               return;
+       if (xs->inet)
+               sk_clear_memalloc(xs->inet);
+       xprt_release_xprt(xprt, NULL);
 }
-EXPORT_SYMBOL_GPL(xs_swapper);
 #else
 static void xs_set_memalloc(struct rpc_xprt *xprt)
 {
 }
+
+static int
+xs_enable_swap(struct rpc_xprt *xprt)
+{
+       return -EINVAL;
+}
+
+static void
+xs_disable_swap(struct rpc_xprt *xprt)
+{
+}
 #endif
 
 static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
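The section above replaces the exported xs_swapper(xprt, enable) with a refcounted pair behind transport ops: the first xs_enable_swap() call tags the socket with sk_set_memalloc() (letting its allocations dip into reserves while swapping over NFS) and every call takes a swapper reference; xs_disable_swap() clears the tag only when the last reference drops, with XPRT_LOCKED taken around both to avoid racing xs_reset_transport(). A caller-side sketch; the wrapper names are illustrative, not the clnt.c API:

    static int sketch_swap_on(struct rpc_xprt *xprt)
    {
            if (!xprt->ops->enable_swap)
                    return -EINVAL;         /* transport cannot back swap I/O */
            return xprt->ops->enable_swap(xprt);
    }

    static void sketch_swap_off(struct rpc_xprt *xprt)
    {
            if (xprt->ops->disable_swap)
                    xprt->ops->disable_swap(xprt);  /* pairs 1:1 with sketch_swap_on */
    }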
@@ -2057,6 +2086,27 @@ out:
        xprt_wake_pending_tasks(xprt, status);
 }
 
+/**
+ * xs_tcp_shutdown - gracefully shut down a TCP socket
+ * @xprt: transport
+ *
+ * Initiates a graceful shutdown of the TCP socket by calling the
+ * equivalent of shutdown(SHUT_RDWR);
+ */
+static void xs_tcp_shutdown(struct rpc_xprt *xprt)
+{
+       struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+       struct socket *sock = transport->sock;
+
+       if (sock == NULL)
+               return;
+       if (xprt_connected(xprt)) {
+               kernel_sock_shutdown(sock, SHUT_RDWR);
+               trace_rpc_socket_shutdown(xprt, sock);
+       } else {
+               xs_reset_transport(transport);
+       }
+}
+
 static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
        struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
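Note that moving xs_tcp_shutdown() here also changes its behaviour: a transport that is no longer connected is now torn down immediately via xs_reset_transport() instead of being half-shut. For reference, the "equivalent of shutdown(SHUT_RDWR)" the comment mentions is the ordinary socket call; a userspace sketch of the same graceful pattern:

    #include <sys/socket.h>
    #include <unistd.h>

    static void graceful_close(int fd)
    {
            /* Send a FIN and stop both directions rather than doing an
             * abortive close; the peer sees EOF instead of a reset. */
            shutdown(fd, SHUT_RDWR);
            close(fd);
    }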
@@ -2067,6 +2117,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
                unsigned int keepidle = xprt->timeout->to_initval / HZ;
                unsigned int keepcnt = xprt->timeout->to_retries + 1;
                unsigned int opt_on = 1;
+               unsigned int timeo;
 
                /* TCP Keepalive options */
                kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
@@ -2078,6 +2129,12 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
                kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
                                (char *)&keepcnt, sizeof(keepcnt));
 
+               /* TCP user timeout (see RFC5482) */
+               timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
+                       (xprt->timeout->to_retries + 1);
+               kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+                               (char *)&timeo, sizeof(timeo));
+
                write_lock_bh(&sk->sk_callback_lock);
 
                xs_save_old_callbacks(transport, sk);
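The new TCP_USER_TIMEOUT setting (RFC 5482) bounds, in milliseconds, how long TCP keeps retransmitting unacknowledged data before aborting the connection. Plugging in the usual NFS-over-TCP defaults (an assumption here: to_initval = 60 * HZ, to_retries = 2):

    /* timeo = jiffies_to_msecs(60 * HZ) * (2 + 1)
     *       = 60000 ms * 3
     *       = 180000 ms (~3 minutes)
     */

so the socket gives up at roughly the same point the RPC layer exhausts its own retry budget, instead of retransmitting indefinitely.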
@@ -2125,9 +2182,6 @@ out:
 
 /**
  * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint
- * @xprt: RPC transport to connect
- * @transport: socket transport to connect
- * @create_sock: function to create a socket of the correct type
  *
  * Invoked by a work queue tasklet.
  */
@@ -2463,6 +2517,8 @@ static struct rpc_xprt_ops xs_local_ops = {
        .close                  = xs_close,
        .destroy                = xs_destroy,
        .print_stats            = xs_local_print_stats,
+       .enable_swap            = xs_enable_swap,
+       .disable_swap           = xs_disable_swap,
 };
 
 static struct rpc_xprt_ops xs_udp_ops = {
@@ -2482,6 +2538,9 @@ static struct rpc_xprt_ops xs_udp_ops = {
        .close                  = xs_close,
        .destroy                = xs_destroy,
        .print_stats            = xs_udp_print_stats,
+       .enable_swap            = xs_enable_swap,
+       .disable_swap           = xs_disable_swap,
+       .inject_disconnect      = xs_inject_disconnect,
 };
 
 static struct rpc_xprt_ops xs_tcp_ops = {
@@ -2498,6 +2557,9 @@ static struct rpc_xprt_ops xs_tcp_ops = {
        .close                  = xs_tcp_shutdown,
        .destroy                = xs_destroy,
        .print_stats            = xs_tcp_print_stats,
+       .enable_swap            = xs_enable_swap,
+       .disable_swap           = xs_disable_swap,
+       .inject_disconnect      = xs_inject_disconnect,
 };
 
 /*
@@ -2515,6 +2577,9 @@ static struct rpc_xprt_ops bc_tcp_ops = {
        .close                  = bc_close,
        .destroy                = bc_destroy,
        .print_stats            = xs_tcp_print_stats,
+       .enable_swap            = xs_enable_swap,
+       .disable_swap           = xs_disable_swap,
+       .inject_disconnect      = xs_inject_disconnect,
 };
 
 static int xs_init_anyaddr(const int family, struct sockaddr *sap)
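One asymmetry worth noting in the ops tables above: all four transports gain enable_swap/disable_swap, but xs_local_ops deliberately omits inject_disconnect, so core code has to treat that op as optional. A minimal sketch of the NULL-tolerant dispatch this implies (the wrapper name is illustrative):

    static inline void sketch_xprt_inject_disconnect(struct rpc_xprt *xprt)
    {
            /* Optional op: AF_LOCAL transports do not provide it. */
            if (xprt->ops->inject_disconnect)
                    xprt->ops->inject_disconnect(xprt);
    }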