Merge branch 'for-linus' of git://linux-nfs.org/~bfields/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 2 Feb 2008 03:31:28 +0000 (14:31 +1100)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 2 Feb 2008 03:31:28 +0000 (14:31 +1100)
* 'for-linus' of git://linux-nfs.org/~bfields/linux: (100 commits)
  SUNRPC: RPC program information is stored in unsigned integers
  SUNRPC: Move exported symbol definitions after function declaration part 2
  NLM: tear down RPC clients in nlm_shutdown_hosts
  SUNRPC: spin svc_rqst initialization to its own function
  nfsd: more careful input validation in nfsctl write methods
  lockd: minor log message fix
  knfsd: don't bother mapping putrootfh enoent to eperm
  rdma: makefile
  rdma: ONCRPC RDMA protocol marshalling
  rdma: SVCRDMA sendto
  rdma: SVCRDMA recvfrom
  rdma: SVCRDMA Core Transport Services
  rdma: SVCRDMA Transport Module
  rdma: SVCRMDA Header File
  svc: Add svc_xprt_names service to replace svc_sock_names
  knfsd: Support adding transports by writing portlist file
  svc: Add svc API that queries for a transport instance
  svc: Add /proc/sys/sunrpc/transport files
  svc: Add transport hdr size for defer/revisit
  svc: Move the xprt independent code to the svc_xprt.c file
  ...

60 files changed:
MAINTAINERS
fs/Kconfig
fs/lockd/host.c
fs/lockd/svc.c
fs/lockd/svc4proc.c
fs/lockd/svclock.c
fs/lockd/svcproc.c
fs/lockd/svcsubs.c
fs/nfs/callback.c
fs/nfsd/auth.h [moved from include/linux/nfsd/auth.h with 87% similarity]
fs/nfsd/export.c
fs/nfsd/nfs2acl.c
fs/nfsd/nfs3xdr.c
fs/nfsd/nfs4callback.c
fs/nfsd/nfs4idmap.c
fs/nfsd/nfs4proc.c
fs/nfsd/nfs4state.c
fs/nfsd/nfs4xdr.c
fs/nfsd/nfscache.c
fs/nfsd/nfsctl.c
fs/nfsd/nfsfh.c
fs/nfsd/nfssvc.c
fs/nfsd/nfsxdr.c
fs/nfsd/vfs.c
include/linux/lockd/lockd.h
include/linux/lockd/xdr.h
include/linux/nfsd/Kbuild
include/linux/nfsd/cache.h
include/linux/nfsd/export.h
include/linux/nfsd/nfsd.h
include/linux/nfsd/syscall.h
include/linux/nfsd/xdr.h
include/linux/nfsd/xdr3.h
include/linux/nfsd/xdr4.h
include/linux/nfsd_idmap.h
include/linux/sunrpc/cache.h
include/linux/sunrpc/debug.h
include/linux/sunrpc/svc.h
include/linux/sunrpc/svc_rdma.h [new file with mode: 0644]
include/linux/sunrpc/svc_xprt.h [new file with mode: 0644]
include/linux/sunrpc/svcsock.h
include/linux/sunrpc/xdr.h
net/sunrpc/Makefile
net/sunrpc/auth_gss/svcauth_gss.c
net/sunrpc/cache.c
net/sunrpc/stats.c
net/sunrpc/sunrpc_syms.c
net/sunrpc/svc.c
net/sunrpc/svc_xprt.c [new file with mode: 0644]
net/sunrpc/svcauth.c
net/sunrpc/svcauth_unix.c
net/sunrpc/svcsock.c
net/sunrpc/sysctl.c
net/sunrpc/xdr.c
net/sunrpc/xprtrdma/Makefile
net/sunrpc/xprtrdma/svc_rdma.c [new file with mode: 0644]
net/sunrpc/xprtrdma/svc_rdma_marshal.c [new file with mode: 0644]
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c [new file with mode: 0644]
net/sunrpc/xprtrdma/svc_rdma_sendto.c [new file with mode: 0644]
net/sunrpc/xprtrdma/svc_rdma_transport.c [new file with mode: 0644]

index 91082e60d2897df6675edad92af8105a2979a19a..6cae13718925d71cc3996400f3d0d52bca1fbaba 100644 (file)
@@ -2247,7 +2247,7 @@ P:        J. Bruce Fields
 M:     bfields@fieldses.org
 P:     Neil Brown
 M:     neilb@suse.de
-L:     nfs@lists.sourceforge.net
+L:     linux-nfs@vger.kernel.org
 W:     http://nfs.sourceforge.net/
 S:     Supported
 
index 219ec06a8c7e2cd057a28898c4827776a1abf0c3..987b5d7cb21a28eb875a7337c734276084a5f5ab 100644 (file)
@@ -1674,6 +1674,8 @@ config NFSD
        select CRYPTO_MD5 if NFSD_V4
        select CRYPTO if NFSD_V4
        select FS_POSIX_ACL if NFSD_V4
+       select PROC_FS if NFSD_V4
+       select PROC_FS if SUNRPC_GSS
        help
          If you want your Linux box to act as an NFS *server*, so that other
          computers on your local network which support NFS can access certain
index 572601e98dcdec6d6d88da33ec0b757bd8b7d3f4..ca6b16fc3101a3df88d60350060eae56d6528535 100644 (file)
@@ -34,10 +34,10 @@ static DEFINE_MUTEX(nlm_host_mutex);
 
 static void                    nlm_gc_hosts(void);
 static struct nsm_handle *     __nsm_find(const struct sockaddr_in *,
-                                       const char *, int, int);
+                                       const char *, unsigned int, int);
 static struct nsm_handle *     nsm_find(const struct sockaddr_in *sin,
                                         const char *hostname,
-                                        int hostname_len);
+                                        unsigned int hostname_len);
 
 /*
  * Common host lookup routine for server & client
@@ -45,7 +45,8 @@ static struct nsm_handle *    nsm_find(const struct sockaddr_in *sin,
 static struct nlm_host *
 nlm_lookup_host(int server, const struct sockaddr_in *sin,
                int proto, int version, const char *hostname,
-               int hostname_len, const struct sockaddr_in *ssin)
+               unsigned int hostname_len,
+               const struct sockaddr_in *ssin)
 {
        struct hlist_head *chain;
        struct hlist_node *pos;
@@ -176,7 +177,7 @@ nlm_destroy_host(struct nlm_host *host)
  */
 struct nlm_host *
 nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
-                       const char *hostname, int hostname_len)
+                       const char *hostname, unsigned int hostname_len)
 {
        struct sockaddr_in ssin = {0};
 
@@ -189,7 +190,7 @@ nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
  */
 struct nlm_host *
 nlmsvc_lookup_host(struct svc_rqst *rqstp,
-                       const char *hostname, int hostname_len)
+                       const char *hostname, unsigned int hostname_len)
 {
        struct sockaddr_in ssin = {0};
 
@@ -307,7 +308,8 @@ void nlm_release_host(struct nlm_host *host)
  * Release all resources held by that peer.
  */
 void nlm_host_rebooted(const struct sockaddr_in *sin,
-                               const char *hostname, int hostname_len,
+                               const char *hostname,
+                               unsigned int hostname_len,
                                u32 new_state)
 {
        struct hlist_head *chain;
@@ -377,8 +379,13 @@ nlm_shutdown_hosts(void)
        /* First, make all hosts eligible for gc */
        dprintk("lockd: nuking all hosts...\n");
        for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-               hlist_for_each_entry(host, pos, chain, h_hash)
+               hlist_for_each_entry(host, pos, chain, h_hash) {
                        host->h_expires = jiffies - 1;
+                       if (host->h_rpcclnt) {
+                               rpc_shutdown_client(host->h_rpcclnt);
+                               host->h_rpcclnt = NULL;
+                       }
+               }
        }
 
        /* Then, perform a garbage collection pass */
@@ -449,7 +456,7 @@ static DEFINE_MUTEX(nsm_mutex);
 
 static struct nsm_handle *
 __nsm_find(const struct sockaddr_in *sin,
-               const char *hostname, int hostname_len,
+               const char *hostname, unsigned int hostname_len,
                int create)
 {
        struct nsm_handle *nsm = NULL;
@@ -503,7 +510,8 @@ out:
 }
 
 static struct nsm_handle *
-nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len)
+nsm_find(const struct sockaddr_in *sin, const char *hostname,
+        unsigned int hostname_len)
 {
        return __nsm_find(sin, hostname, hostname_len, 1);
 }
index 82e2192a0d5c873a72b867702c511b0c8872f6a7..08226464e5638f1c9ec0636124800609ffb09b84 100644 (file)
@@ -219,19 +219,6 @@ lockd(struct svc_rqst *rqstp)
        module_put_and_exit(0);
 }
 
-
-static int find_socket(struct svc_serv *serv, int proto)
-{
-       struct svc_sock *svsk;
-       int found = 0;
-       list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
-               if (svsk->sk_sk->sk_protocol == proto) {
-                       found = 1;
-                       break;
-               }
-       return found;
-}
-
 /*
  * Make any sockets that are needed but not present.
  * If nlm_udpport or nlm_tcpport were set as module
@@ -240,17 +227,25 @@ static int find_socket(struct svc_serv *serv, int proto)
 static int make_socks(struct svc_serv *serv, int proto)
 {
        static int warned;
+       struct svc_xprt *xprt;
        int err = 0;
 
-       if (proto == IPPROTO_UDP || nlm_udpport)
-               if (!find_socket(serv, IPPROTO_UDP))
-                       err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport,
-                                               SVC_SOCK_DEFAULTS);
-       if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport))
-               if (!find_socket(serv, IPPROTO_TCP))
-                       err = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport,
-                                               SVC_SOCK_DEFAULTS);
-
+       if (proto == IPPROTO_UDP || nlm_udpport) {
+               xprt = svc_find_xprt(serv, "udp", 0, 0);
+               if (!xprt)
+                       err = svc_create_xprt(serv, "udp", nlm_udpport,
+                                             SVC_SOCK_DEFAULTS);
+               else
+                       svc_xprt_put(xprt);
+       }
+       if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
+               xprt = svc_find_xprt(serv, "tcp", 0, 0);
+               if (!xprt)
+                       err = svc_create_xprt(serv, "tcp", nlm_tcpport,
+                                             SVC_SOCK_DEFAULTS);
+               else
+                       svc_xprt_put(xprt);
+       }
        if (err >= 0) {
                warned = 0;
                err = 0;
index bf27b6c6cb6b6219f0518d856d77796ed9694828..385437e3387de895fde1540f58c7d3269a842373 100644 (file)
@@ -84,6 +84,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
        struct nlm_host *host;
        struct nlm_file *file;
+       int rc = rpc_success;
 
        dprintk("lockd: TEST4        called\n");
        resp->cookie = argp->cookie;
@@ -91,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Don't accept test requests during grace period */
        if (nlmsvc_grace_period) {
                resp->status = nlm_lck_denied_grace_period;
-               return rpc_success;
+               return rc;
        }
 
        /* Obtain client and file */
@@ -101,12 +102,13 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Now check for conflicting locks */
        resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie);
        if (resp->status == nlm_drop_reply)
-               return rpc_drop_reply;
+               rc = rpc_drop_reply;
+       else
+               dprintk("lockd: TEST4        status %d\n", ntohl(resp->status));
 
-       dprintk("lockd: TEST4          status %d\n", ntohl(resp->status));
        nlm_release_host(host);
        nlm_release_file(file);
-       return rpc_success;
+       return rc;
 }
 
 static __be32
@@ -115,6 +117,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
        struct nlm_host *host;
        struct nlm_file *file;
+       int rc = rpc_success;
 
        dprintk("lockd: LOCK          called\n");
 
@@ -123,7 +126,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Don't accept new lock requests during grace period */
        if (nlmsvc_grace_period && !argp->reclaim) {
                resp->status = nlm_lck_denied_grace_period;
-               return rpc_success;
+               return rc;
        }
 
        /* Obtain client and file */
@@ -146,12 +149,13 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = nlmsvc_lock(rqstp, file, &argp->lock,
                                        argp->block, &argp->cookie);
        if (resp->status == nlm_drop_reply)
-               return rpc_drop_reply;
+               rc = rpc_drop_reply;
+       else
+               dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
 
-       dprintk("lockd: LOCK          status %d\n", ntohl(resp->status));
        nlm_release_host(host);
        nlm_release_file(file);
-       return rpc_success;
+       return rc;
 }
 
 static __be32
index d120ec39bcb0ea5c472fc7c1f3ba277b9308709d..2f4d8fa666892b9fa8b41266a627608c55c1e8a0 100644 (file)
@@ -501,25 +501,29 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
                        block, block->b_flags, block->b_fl);
                if (block->b_flags & B_TIMED_OUT) {
                        nlmsvc_unlink_block(block);
-                       return nlm_lck_denied;
+                       ret = nlm_lck_denied;
+                       goto out;
                }
                if (block->b_flags & B_GOT_CALLBACK) {
+                       nlmsvc_unlink_block(block);
                        if (block->b_fl != NULL
                                        && block->b_fl->fl_type != F_UNLCK) {
                                lock->fl = *block->b_fl;
                                goto conf_lock;
-                       }
-                       else {
-                               nlmsvc_unlink_block(block);
-                               return nlm_granted;
+                       } else {
+                               ret = nlm_granted;
+                               goto out;
                        }
                }
-               return nlm_drop_reply;
+               ret = nlm_drop_reply;
+               goto out;
        }
 
        error = vfs_test_lock(file->f_file, &lock->fl);
-       if (error == -EINPROGRESS)
-               return nlmsvc_defer_lock_rqst(rqstp, block);
+       if (error == -EINPROGRESS) {
+               ret = nlmsvc_defer_lock_rqst(rqstp, block);
+               goto out;
+       }
        if (error) {
                ret = nlm_lck_denied_nolocks;
                goto out;
index 9cd5c8b37593f099d3c64da583085b733175c038..88379cc6e0b1c0ee196c8686c3f0e7c46cabef9a 100644 (file)
@@ -113,6 +113,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
        struct nlm_host *host;
        struct nlm_file *file;
+       int rc = rpc_success;
 
        dprintk("lockd: TEST          called\n");
        resp->cookie = argp->cookie;
@@ -120,7 +121,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Don't accept test requests during grace period */
        if (nlmsvc_grace_period) {
                resp->status = nlm_lck_denied_grace_period;
-               return rpc_success;
+               return rc;
        }
 
        /* Obtain client and file */
@@ -130,13 +131,14 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Now check for conflicting locks */
        resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie));
        if (resp->status == nlm_drop_reply)
-               return rpc_drop_reply;
+               rc = rpc_drop_reply;
+       else
+               dprintk("lockd: TEST          status %d vers %d\n",
+                       ntohl(resp->status), rqstp->rq_vers);
 
-       dprintk("lockd: TEST          status %d vers %d\n",
-               ntohl(resp->status), rqstp->rq_vers);
        nlm_release_host(host);
        nlm_release_file(file);
-       return rpc_success;
+       return rc;
 }
 
 static __be32
@@ -145,6 +147,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
        struct nlm_host *host;
        struct nlm_file *file;
+       int rc = rpc_success;
 
        dprintk("lockd: LOCK          called\n");
 
@@ -153,7 +156,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        /* Don't accept new lock requests during grace period */
        if (nlmsvc_grace_period && !argp->reclaim) {
                resp->status = nlm_lck_denied_grace_period;
-               return rpc_success;
+               return rc;
        }
 
        /* Obtain client and file */
@@ -176,12 +179,13 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock,
                                               argp->block, &argp->cookie));
        if (resp->status == nlm_drop_reply)
-               return rpc_drop_reply;
+               rc = rpc_drop_reply;
+       else
+               dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
 
-       dprintk("lockd: LOCK          status %d\n", ntohl(resp->status));
        nlm_release_host(host);
        nlm_release_file(file);
-       return rpc_success;
+       return rc;
 }
 
 static __be32
index 84ebba33b98d0ae2567cb79c45adbae5f9d751fb..dbbefbcd671255935c61783e5cf73d4122346cd7 100644 (file)
@@ -87,7 +87,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
        unsigned int    hash;
        __be32          nfserr;
 
-       nlm_debug_print_fh("nlm_file_lookup", f);
+       nlm_debug_print_fh("nlm_lookup_file", f);
 
        hash = file_hash(f);
 
index 9b6bbf1b978795b8ba29668db9313f1e86bb0d36..bd185a572a23aeb0d015b701d0786dab359f260a 100644 (file)
@@ -119,8 +119,8 @@ int nfs_callback_up(void)
        if (!serv)
                goto out_err;
 
-       ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport,
-                                                       SVC_SOCK_ANONYMOUS);
+       ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport,
+                             SVC_SOCK_ANONYMOUS);
        if (ret <= 0)
                goto out_destroy;
        nfs_callback_tcpport = ret;
similarity index 87%
rename from include/linux/nfsd/auth.h
rename to fs/nfsd/auth.h
index 0fb9f7212195383e47c5354b33e7399ee29a75c0..78b3c0e9382279a5205cbb3e90dc86811a85a10c 100644 (file)
@@ -1,6 +1,4 @@
 /*
- * include/linux/nfsd/auth.h
- *
  * nfsd-specific authentication stuff.
  * uid/gid mapping not yet implemented.
  *
@@ -10,8 +8,6 @@
 #ifndef LINUX_NFSD_AUTH_H
 #define LINUX_NFSD_AUTH_H
 
-#ifdef __KERNEL__
-
 #define nfsd_luid(rq, uid)     ((u32)(uid))
 #define nfsd_lgid(rq, gid)     ((u32)(gid))
 #define nfsd_ruid(rq, uid)     ((u32)(uid))
@@ -23,5 +19,4 @@
  */
 int nfsd_setuser(struct svc_rqst *, struct svc_export *);
 
-#endif /* __KERNEL__ */
 #endif /* LINUX_NFSD_AUTH_H */
index 66d0aeb32a47e5f4385e68c1947a8d8c1ab3a7e5..79b4bf8129602b9ebbbec979960d614e90b7185b 100644 (file)
@@ -1357,8 +1357,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
        mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
 
        exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
-       if (PTR_ERR(exp) == -ENOENT)
-               return nfserr_perm;
        if (IS_ERR(exp))
                return nfserrno(PTR_ERR(exp));
        rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
@@ -1637,13 +1635,19 @@ exp_verify_string(char *cp, int max)
 /*
  * Initialize the exports module.
  */
-void
+int
 nfsd_export_init(void)
 {
+       int rv;
        dprintk("nfsd: initializing export module.\n");
 
-       cache_register(&svc_export_cache);
-       cache_register(&svc_expkey_cache);
+       rv = cache_register(&svc_export_cache);
+       if (rv)
+               return rv;
+       rv = cache_register(&svc_expkey_cache);
+       if (rv)
+               cache_unregister(&svc_export_cache);
+       return rv;
 
 }
 
@@ -1670,10 +1674,8 @@ nfsd_export_shutdown(void)
 
        exp_writelock();
 
-       if (cache_unregister(&svc_expkey_cache))
-               printk(KERN_ERR "nfsd: failed to unregister expkey cache\n");
-       if (cache_unregister(&svc_export_cache))
-               printk(KERN_ERR "nfsd: failed to unregister export cache\n");
+       cache_unregister(&svc_expkey_cache);
+       cache_unregister(&svc_export_cache);
        svcauth_unix_purge();
 
        exp_writeunlock();
index 0e5fa11e6b44c4fa114e5e48ef0408a4227a5c07..1c3b7654e966d9f02cd46d8284ee636e670dd882 100644 (file)
@@ -221,12 +221,17 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
                struct nfsd3_getaclres *resp)
 {
        struct dentry *dentry = resp->fh.fh_dentry;
-       struct inode *inode = dentry->d_inode;
+       struct inode *inode;
        struct kvec *head = rqstp->rq_res.head;
        unsigned int base;
        int n;
        int w;
 
+       /*
+        * Since this is version 2, the check for nfserr in
+        * nfsd_dispatch actually ensures the following cannot happen.
+        * However, it seems fragile to depend on that.
+        */
        if (dentry == NULL || dentry->d_inode == NULL)
                return 0;
        inode = dentry->d_inode;
index f917fd25858af81a4edaf712fffa3480ee3a8e6b..d7647f70e02b0b2cf219577a67ebdff68ce73116 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/xdr3.h>
+#include "auth.h"
 
 #define NFSDDBG_FACILITY               NFSDDBG_XDR
 
@@ -88,10 +89,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
  * no slashes or null bytes.
  */
 static __be32 *
-decode_filename(__be32 *p, char **namp, int *lenp)
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
 {
        char            *name;
-       int             i;
+       unsigned int    i;
 
        if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) {
                for (i = 0, name = *namp; i < *lenp; i++, name++) {
@@ -452,8 +453,7 @@ int
 nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
                                        struct nfsd3_symlinkargs *args)
 {
-       unsigned int len;
-       int avail;
+       unsigned int len, avail;
        char *old, *new;
        struct kvec *vec;
 
@@ -486,7 +486,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
        /* now copy next page if there is one */
        if (len && !avail && rqstp->rq_arg.page_len) {
                avail = rqstp->rq_arg.page_len;
-               if (avail > PAGE_SIZE) avail = PAGE_SIZE;
+               if (avail > PAGE_SIZE)
+                       avail = PAGE_SIZE;
                old = page_address(rqstp->rq_arg.pages[0]);
        }
        while (len && avail && *old) {
@@ -816,11 +817,11 @@ static __be32 *
 encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
                struct svc_fh *fhp)
 {
-               p = encode_post_op_attr(cd->rqstp, p, fhp);
-               *p++ = xdr_one;                 /* yes, a file handle follows */
-               p = encode_fh(p, fhp);
-               fh_put(fhp);
-               return p;
+       p = encode_post_op_attr(cd->rqstp, p, fhp);
+       *p++ = xdr_one;                 /* yes, a file handle follows */
+       p = encode_fh(p, fhp);
+       fh_put(fhp);
+       return p;
 }
 
 static int
index 9d536a8cb3795651551af7b228407a09d890f7b7..aae2b29ae2c9e6d4b645f91bf68b50046030accc 100644 (file)
@@ -350,30 +350,6 @@ static struct rpc_version *        nfs_cb_version[] = {
 static int do_probe_callback(void *data)
 {
        struct nfs4_client *clp = data;
-       struct nfs4_callback *cb = &clp->cl_callback;
-       struct rpc_message msg = {
-               .rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-               .rpc_argp       = clp,
-       };
-       int status;
-
-       status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT);
-
-       if (status) {
-               rpc_shutdown_client(cb->cb_client);
-               cb->cb_client = NULL;
-       } else
-               atomic_set(&cb->cb_set, 1);
-       put_nfs4_client(clp);
-       return 0;
-}
-
-/*
- * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
- */
-void
-nfsd4_probe_callback(struct nfs4_client *clp)
-{
        struct sockaddr_in      addr;
        struct nfs4_callback    *cb = &clp->cl_callback;
        struct rpc_timeout      timeparms = {
@@ -390,13 +366,15 @@ nfsd4_probe_callback(struct nfs4_client *clp)
                .timeout        = &timeparms,
                .program        = program,
                .version        = nfs_cb_version[1]->number,
-               .authflavor     = RPC_AUTH_UNIX,        /* XXX: need AUTH_GSS... */
+               .authflavor     = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
                .flags          = (RPC_CLNT_CREATE_NOPING),
        };
-       struct task_struct *t;
-
-       if (atomic_read(&cb->cb_set))
-               return;
+       struct rpc_message msg = {
+               .rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
+               .rpc_argp       = clp,
+       };
+       struct rpc_clnt *client;
+       int status;
 
        /* Initialize address */
        memset(&addr, 0, sizeof(addr));
@@ -416,29 +394,50 @@ nfsd4_probe_callback(struct nfs4_client *clp)
        program->stats->program = program;
 
        /* Create RPC client */
-       cb->cb_client = rpc_create(&args);
-       if (IS_ERR(cb->cb_client)) {
+       client = rpc_create(&args);
+       if (IS_ERR(client)) {
                dprintk("NFSD: couldn't create callback client\n");
+               status = PTR_ERR(client);
                goto out_err;
        }
 
+       status = rpc_call_sync(client, &msg, RPC_TASK_SOFT);
+
+       if (status)
+               goto out_release_client;
+
+       cb->cb_client = client;
+       atomic_set(&cb->cb_set, 1);
+       put_nfs4_client(clp);
+       return 0;
+out_release_client:
+       rpc_shutdown_client(client);
+out_err:
+       put_nfs4_client(clp);
+       dprintk("NFSD: warning: no callback path to client %.*s\n",
+               (int)clp->cl_name.len, clp->cl_name.data);
+       return status;
+}
+
+/*
+ * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
+ */
+void
+nfsd4_probe_callback(struct nfs4_client *clp)
+{
+       struct task_struct *t;
+
+       BUG_ON(atomic_read(&clp->cl_callback.cb_set));
+
        /* the task holds a reference to the nfs4_client struct */
        atomic_inc(&clp->cl_count);
 
        t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe");
 
        if (IS_ERR(t))
-               goto out_release_clp;
+               atomic_dec(&clp->cl_count);
 
        return;
-
-out_release_clp:
-       atomic_dec(&clp->cl_count);
-       rpc_shutdown_client(cb->cb_client);
-out_err:
-       cb->cb_client = NULL;
-       dprintk("NFSD: warning: no callback path to client %.*s\n",
-               (int)clp->cl_name.len, clp->cl_name.data);
 }
 
 /*
@@ -458,9 +457,6 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
        int retries = 1;
        int status = 0;
 
-       if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt)
-               return;
-
        cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */
        cbr->cbr_dp = dp;
 
@@ -469,6 +465,7 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
                switch (status) {
                        case -EIO:
                                /* Network partition? */
+                               atomic_set(&clp->cl_callback.cb_set, 0);
                        case -EBADHANDLE:
                        case -NFS4ERR_BAD_STATEID:
                                /* Race: client probably got cb_recall
@@ -481,11 +478,10 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
                status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
        }
 out_put_cred:
-       if (status == -EIO)
-               atomic_set(&clp->cl_callback.cb_set, 0);
-       /* Success or failure, now we're either waiting for lease expiration
-        * or deleg_return. */
-       dprintk("NFSD: nfs4_cb_recall: dp %p dl_flock %p dl_count %d\n",dp, dp->dl_flock, atomic_read(&dp->dl_count));
+       /*
+        * Success or failure, now we're either waiting for lease expiration
+        * or deleg_return.
+        */
        put_nfs4_client(clp);
        nfs4_put_delegation(dp);
        return;
index 4c0c683ce07a8be9852f06370dfad35d3591a45a..996bd88b75ba4d4e094129d6e856c561f6843b57 100644 (file)
@@ -255,13 +255,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
                goto out;
        if (len == 0)
                set_bit(CACHE_NEGATIVE, &ent.h.flags);
-       else {
-               if (error >= IDMAP_NAMESZ) {
-                       error = -EINVAL;
-                       goto out;
-               }
+       else if (len >= IDMAP_NAMESZ)
+               goto out;
+       else
                memcpy(ent.name, buf1, sizeof(ent.name));
-       }
        error = -ENOMEM;
        res = idtoname_update(&ent, res);
        if (res == NULL)
@@ -467,20 +464,25 @@ nametoid_update(struct ent *new, struct ent *old)
  * Exported API
  */
 
-void
+int
 nfsd_idmap_init(void)
 {
-       cache_register(&idtoname_cache);
-       cache_register(&nametoid_cache);
+       int rv;
+
+       rv = cache_register(&idtoname_cache);
+       if (rv)
+               return rv;
+       rv = cache_register(&nametoid_cache);
+       if (rv)
+               cache_unregister(&idtoname_cache);
+       return rv;
 }
 
 void
 nfsd_idmap_shutdown(void)
 {
-       if (cache_unregister(&idtoname_cache))
-               printk(KERN_ERR "nfsd: failed to unregister idtoname cache\n");
-       if (cache_unregister(&nametoid_cache))
-               printk(KERN_ERR "nfsd: failed to unregister nametoid cache\n");
+       cache_unregister(&idtoname_cache);
+       cache_unregister(&nametoid_cache);
 }
 
 /*
index 18ead1790bb388461b08d42e3e5fcc5c1d1807cd..c593db047d8bbd51babb22bbacc765f65636ff68 100644 (file)
@@ -750,7 +750,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                                    cstate->current_fh.fh_export,
                                    cstate->current_fh.fh_dentry, buf,
                                    &count, verify->ve_bmval,
-                                   rqstp);
+                                   rqstp, 0);
 
        /* this means that nfsd4_encode_fattr() ran out of space */
        if (status == nfserr_resource && count == 0)
index 31673cd251c3c2936b99e566a34773f5a0dce0c1..f6744bc03dae28e5249132ccc719a47e25ffeb63 100644 (file)
@@ -61,7 +61,6 @@ static time_t lease_time = 90;     /* default lease time */
 static time_t user_lease_time = 90;
 static time_t boot_time;
 static int in_grace = 1;
-static u32 current_clientid = 1;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
 static u32 current_delegid = 1;
@@ -340,21 +339,20 @@ STALE_CLIENTID(clientid_t *clid)
  * This type of memory management is somewhat inefficient, but we use it
  * anyway since SETCLIENTID is not a common operation.
  */
-static inline struct nfs4_client *
-alloc_client(struct xdr_netobj name)
+static struct nfs4_client *alloc_client(struct xdr_netobj name)
 {
        struct nfs4_client *clp;
 
-       if ((clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) {
-               if ((clp->cl_name.data = kmalloc(name.len, GFP_KERNEL)) != NULL) {
-                       memcpy(clp->cl_name.data, name.data, name.len);
-                       clp->cl_name.len = name.len;
-               }
-               else {
-                       kfree(clp);
-                       clp = NULL;
-               }
+       clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
+       if (clp == NULL)
+               return NULL;
+       clp->cl_name.data = kmalloc(name.len, GFP_KERNEL);
+       if (clp->cl_name.data == NULL) {
+               kfree(clp);
+               return NULL;
        }
+       memcpy(clp->cl_name.data, name.data, name.len);
+       clp->cl_name.len = name.len;
        return clp;
 }
 
@@ -363,8 +361,11 @@ shutdown_callback_client(struct nfs4_client *clp)
 {
        struct rpc_clnt *clnt = clp->cl_callback.cb_client;
 
-       /* shutdown rpc client, ending any outstanding recall rpcs */
        if (clnt) {
+               /*
+                * Callback threads take a reference on the client, so there
+                * should be no outstanding callbacks at this point.
+                */
                clp->cl_callback.cb_client = NULL;
                rpc_shutdown_client(clnt);
        }
@@ -422,12 +423,13 @@ expire_client(struct nfs4_client *clp)
        put_nfs4_client(clp);
 }
 
-static struct nfs4_client *
-create_client(struct xdr_netobj name, char *recdir) {
+static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
+{
        struct nfs4_client *clp;
 
-       if (!(clp = alloc_client(name)))
-               goto out;
+       clp = alloc_client(name);
+       if (clp == NULL)
+               return NULL;
        memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
        atomic_set(&clp->cl_count, 1);
        atomic_set(&clp->cl_callback.cb_set, 0);
@@ -436,32 +438,30 @@ create_client(struct xdr_netobj name, char *recdir) {
        INIT_LIST_HEAD(&clp->cl_openowners);
        INIT_LIST_HEAD(&clp->cl_delegations);
        INIT_LIST_HEAD(&clp->cl_lru);
-out:
        return clp;
 }
 
-static void
-copy_verf(struct nfs4_client *target, nfs4_verifier *source) {
-       memcpy(target->cl_verifier.data, source->data, sizeof(target->cl_verifier.data));
+static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
+{
+       memcpy(target->cl_verifier.data, source->data,
+                       sizeof(target->cl_verifier.data));
 }
 
-static void
-copy_clid(struct nfs4_client *target, struct nfs4_client *source) {
+static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
+{
        target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; 
        target->cl_clientid.cl_id = source->cl_clientid.cl_id; 
 }
 
-static void
-copy_cred(struct svc_cred *target, struct svc_cred *source) {
-
+static void copy_cred(struct svc_cred *target, struct svc_cred *source)
+{
        target->cr_uid = source->cr_uid;
        target->cr_gid = source->cr_gid;
        target->cr_group_info = source->cr_group_info;
        get_group_info(target->cr_group_info);
 }
 
-static inline int
-same_name(const char *n1, const char *n2)
+static int same_name(const char *n1, const char *n2)
 {
        return 0 == memcmp(n1, n2, HEXDIR_LEN);
 }
@@ -485,26 +485,26 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
        return cr1->cr_uid == cr2->cr_uid;
 }
 
-static void
-gen_clid(struct nfs4_client *clp) {
+static void gen_clid(struct nfs4_client *clp)
+{
+       static u32 current_clientid = 1;
+
        clp->cl_clientid.cl_boot = boot_time;
        clp->cl_clientid.cl_id = current_clientid++; 
 }
 
-static void
-gen_confirm(struct nfs4_client *clp) {
-       struct timespec         tv;
-       u32 *                   p;
+static void gen_confirm(struct nfs4_client *clp)
+{
+       static u32 i;
+       u32 *p;
 
-       tv = CURRENT_TIME;
        p = (u32 *)clp->cl_confirm.data;
-       *p++ = tv.tv_sec;
-       *p++ = tv.tv_nsec;
+       *p++ = get_seconds();
+       *p++ = i++;
 }
 
-static int
-check_name(struct xdr_netobj name) {
-
+static int check_name(struct xdr_netobj name)
+{
        if (name.len == 0) 
                return 0;
        if (name.len > NFS4_OPAQUE_LIMIT) {
@@ -683,39 +683,6 @@ out_err:
        return;
 }
 
-/*
- * RFC 3010 has a complex implmentation description of processing a 
- * SETCLIENTID request consisting of 5 bullets, labeled as 
- * CASE0 - CASE4 below.
- *
- * NOTES:
- *     callback information will be processed in a future patch
- *
- *     an unconfirmed record is added when:
- *      NORMAL (part of CASE 4): there is no confirmed nor unconfirmed record.
- *     CASE 1: confirmed record found with matching name, principal,
- *             verifier, and clientid.
- *     CASE 2: confirmed record found with matching name, principal,
- *             and there is no unconfirmed record with matching
- *             name and principal
- *
- *      an unconfirmed record is replaced when:
- *     CASE 3: confirmed record found with matching name, principal,
- *             and an unconfirmed record is found with matching 
- *             name, principal, and with clientid and
- *             confirm that does not match the confirmed record.
- *     CASE 4: there is no confirmed record with matching name and 
- *             principal. there is an unconfirmed record with 
- *             matching name, principal.
- *
- *     an unconfirmed record is deleted when:
- *     CASE 1: an unconfirmed record that matches input name, verifier,
- *             and confirmed clientid.
- *     CASE 4: any unconfirmed records with matching name and principal
- *             that exist after an unconfirmed record has been replaced
- *             as described above.
- *
- */
 __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                  struct nfsd4_setclientid *setclid)
@@ -748,11 +715,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        nfs4_lock_state();
        conf = find_confirmed_client_by_str(dname, strhashval);
        if (conf) {
-               /* 
-                * CASE 0:
-                * clname match, confirmed, different principal
-                * or different ip_address
-                */
+               /* RFC 3530 14.2.33 CASE 0: */
                status = nfserr_clid_inuse;
                if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
                                || conf->cl_addr != sin->sin_addr.s_addr) {
@@ -761,12 +724,17 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        goto out;
                }
        }
+       /*
+        * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION")
+        * has a description of SETCLIENTID request processing consisting
+        * of 5 bullet points, labeled as CASE0 - CASE4 below.
+        */
        unconf = find_unconfirmed_client_by_str(dname, strhashval);
        status = nfserr_resource;
        if (!conf) {
-               /* 
-                * CASE 4:
-                * placed first, because it is the normal case.
+               /*
+                * RFC 3530 14.2.33 CASE 4:
+                * placed first, because it is the normal case
                 */
                if (unconf)
                        expire_client(unconf);
@@ -776,17 +744,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                gen_clid(new);
        } else if (same_verf(&conf->cl_verifier, &clverifier)) {
                /*
-                * CASE 1:
-                * cl_name match, confirmed, principal match
-                * verifier match: probable callback update
-                *
-                * remove any unconfirmed nfs4_client with 
-                * matching cl_name, cl_verifier, and cl_clientid
-                *
-                * create and insert an unconfirmed nfs4_client with same 
-                * cl_name, cl_verifier, and cl_clientid as existing 
-                * nfs4_client,  but with the new callback info and a 
-                * new cl_confirm
+                * RFC 3530 14.2.33 CASE 1:
+                * probable callback update
                 */
                if (unconf) {
                        /* Note this is removing unconfirmed {*x***},
@@ -802,43 +761,25 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                copy_clid(new, conf);
        } else if (!unconf) {
                /*
-                * CASE 2:
-                * clname match, confirmed, principal match
-                * verfier does not match
-                * no unconfirmed. create a new unconfirmed nfs4_client
-                * using input clverifier, clname, and callback info
-                * and generate a new cl_clientid and cl_confirm.
+                * RFC 3530 14.2.33 CASE 2:
+                * probable client reboot; state will be removed if
+                * confirmed.
                 */
                new = create_client(clname, dname);
                if (new == NULL)
                        goto out;
                gen_clid(new);
-       } else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) {
-               /*      
-                * CASE3:
-                * confirmed found (name, principal match)
-                * confirmed verifier does not match input clverifier
-                *
-                * unconfirmed found (name match)
-                * confirmed->cl_confirm != unconfirmed->cl_confirm
-                *
-                * remove unconfirmed.
-                *
-                * create an unconfirmed nfs4_client 
-                * with same cl_name as existing confirmed nfs4_client, 
-                * but with new callback info, new cl_clientid,
-                * new cl_verifier and a new cl_confirm
+       } else {
+               /*
+                * RFC 3530 14.2.33 CASE 3:
+                * probable client reboot; state will be removed if
+                * confirmed.
                 */
                expire_client(unconf);
                new = create_client(clname, dname);
                if (new == NULL)
                        goto out;
                gen_clid(new);
-       } else {
-               /* No cases hit !!! */
-               status = nfserr_inval;
-               goto out;
-
        }
        copy_verf(new, &clverifier);
        new->cl_addr = sin->sin_addr.s_addr;
@@ -857,11 +798,9 @@ out:
 
 
 /*
- * RFC 3010 has a complex implmentation description of processing a 
- * SETCLIENTID_CONFIRM request consisting of 4 bullets describing
- * processing on a DRC miss, labeled as CASE1 - CASE4 below.
- *
- * NOTE: callback information will be processed here in a future patch
+ * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has
+ * a description of SETCLIENTID_CONFIRM request processing consisting of 4
+ * bullets, labeled as CASE1 - CASE4 below.
  */
 __be32
 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
@@ -892,16 +831,16 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
        if (unconf && unconf->cl_addr != sin->sin_addr.s_addr)
                goto out;
 
-       if ((conf && unconf) && 
-           (same_verf(&unconf->cl_confirm, &confirm)) &&
-           (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) &&
-           (same_name(conf->cl_recdir,unconf->cl_recdir))  &&
-           (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) {
-               /* CASE 1:
-               * unconf record that matches input clientid and input confirm.
-               * conf record that matches input clientid.
-               * conf and unconf records match names, verifiers
-               */
+       /*
+        * section 14.2.34 of RFC 3530 has a description of
+        * SETCLIENTID_CONFIRM request processing consisting
+        * of 4 bullet points, labeled as CASE1 - CASE4 below.
+        */
+       if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) {
+               /*
+                * RFC 3530 14.2.34 CASE 1:
+                * callback update
+                */
                if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
                        status = nfserr_clid_inuse;
                else {
@@ -914,15 +853,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                        status = nfs_ok;
 
                }
-       } else if ((conf && !unconf) ||
-           ((conf && unconf) && 
-            (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) ||
-             !same_name(conf->cl_recdir, unconf->cl_recdir)))) {
-               /* CASE 2:
-                * conf record that matches input clientid.
-                * if unconf record matches input clientid, then
-                * unconf->cl_name or unconf->cl_verifier don't match the
-                * conf record.
+       } else if (conf && !unconf) {
+               /*
+                * RFC 3530 14.2.34 CASE 2:
+                * probable retransmitted request; play it safe and
+                * do nothing.
                 */
                if (!same_creds(&conf->cl_cred, &rqstp->rq_cred))
                        status = nfserr_clid_inuse;
@@ -930,10 +865,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                        status = nfs_ok;
        } else if (!conf && unconf
                        && same_verf(&unconf->cl_confirm, &confirm)) {
-               /* CASE 3:
-                * conf record not found.
-                * unconf record found.
-                * unconf->cl_confirm matches input confirm
+               /*
+                * RFC 3530 14.2.34 CASE 3:
+                * Normal case; new or rebooted client:
                 */
                if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) {
                        status = nfserr_clid_inuse;
@@ -948,16 +882,15 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                        }
                        move_to_confirmed(unconf);
                        conf = unconf;
+                       nfsd4_probe_callback(conf);
                        status = nfs_ok;
                }
        } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
            && (!unconf || (unconf && !same_verf(&unconf->cl_confirm,
                                                                &confirm)))) {
-               /* CASE 4:
-                * conf record not found, or if conf, conf->cl_confirm does not
-                * match input confirm.
-                * unconf record not found, or if unconf, unconf->cl_confirm
-                * does not match input confirm.
+               /*
+                * RFC 3530 14.2.34 CASE 4:
+                * Client probably hasn't noticed that we rebooted yet.
                 */
                status = nfserr_stale_clientid;
        } else {
@@ -965,8 +898,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                status = nfserr_clid_inuse;
        }
 out:
-       if (!status)
-               nfsd4_probe_callback(conf);
        nfs4_unlock_state();
        return status;
 }
@@ -1226,14 +1157,19 @@ find_file(struct inode *ino)
        return NULL;
 }
 
-static int access_valid(u32 x)
+static inline int access_valid(u32 x)
 {
-       return (x > 0 && x < 4);
+       if (x < NFS4_SHARE_ACCESS_READ)
+               return 0;
+       if (x > NFS4_SHARE_ACCESS_BOTH)
+               return 0;
+       return 1;
 }
 
-static int deny_valid(u32 x)
+static inline int deny_valid(u32 x)
 {
-       return (x >= 0 && x < 5);
+       /* Note: unlike access bits, deny bits may be zero. */
+       return x <= NFS4_SHARE_DENY_BOTH;
 }
 
 static void
@@ -2162,8 +2098,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
                goto check_replay;
        }
 
+       *stpp = stp;
+       *sopp = sop = stp->st_stateowner;
+
        if (lock) {
-               struct nfs4_stateowner *sop = stp->st_stateowner;
                clientid_t *lockclid = &lock->v.new.clientid;
                struct nfs4_client *clp = sop->so_client;
                int lkflg = 0;
@@ -2193,9 +2131,6 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
                return nfserr_bad_stateid;
        }
 
-       *stpp = stp;
-       *sopp = sop = stp->st_stateowner;
-
        /*
        *  We now validate the seqid and stateid generation numbers.
        *  For the moment, we ignore the possibility of 
index 57333944af7fe5c63937472088c29b4725f006d2..b0592e7c378dbdf29878528bdfd8e3def4c744ad 100644 (file)
@@ -148,12 +148,12 @@ xdr_error:                                        \
        }                                       \
 } while (0)
 
-static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
+static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
 {
        /* We want more bytes than seem to be available.
         * Maybe we need a new page, maybe we have just run out
         */
-       int avail = (char*)argp->end - (char*)argp->p;
+       unsigned int avail = (char *)argp->end - (char *)argp->p;
        __be32 *p;
        if (avail + argp->pagelen < nbytes)
                return NULL;
@@ -169,6 +169,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
                        return NULL;
                
        }
+       /*
+        * The following memcpy is safe because read_buf is always
+        * called with nbytes > avail, and the two cases above both
+        * guarantee p points to at least nbytes bytes.
+        */
        memcpy(p, argp->p, avail);
        /* step to next page */
        argp->p = page_address(argp->pagelist[0]);
@@ -1448,7 +1453,7 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
 __be32
 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
                struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
-               struct svc_rqst *rqstp)
+               struct svc_rqst *rqstp, int ignore_crossmnt)
 {
        u32 bmval0 = bmval[0];
        u32 bmval1 = bmval[1];
@@ -1828,7 +1833,12 @@ out_acl:
        if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
                if ((buflen -= 8) < 0)
                        goto out_resource;
-               if (exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) {
+               /*
+                * Get parent's attributes if not ignoring crossmount
+                * and this is the root of a cross-mounted filesystem.
+                */
+               if (ignore_crossmnt == 0 &&
+                   exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) {
                        err = vfs_getattr(exp->ex_mnt->mnt_parent,
                                exp->ex_mnt->mnt_mountpoint, &stat);
                        if (err)
@@ -1864,13 +1874,25 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
        struct svc_export *exp = cd->rd_fhp->fh_export;
        struct dentry *dentry;
        __be32 nfserr;
+       int ignore_crossmnt = 0;
 
        dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
        if (IS_ERR(dentry))
                return nfserrno(PTR_ERR(dentry));
 
        exp_get(exp);
-       if (d_mountpoint(dentry)) {
+       /*
+        * In the case of a mountpoint, the client may be asking for
+        * attributes that are only properties of the underlying filesystem
+        * as opposed to the cross-mounted file system. In such a case,
+        * we will not follow the cross mount and will fill the attributes
+        * directly from the mountpoint dentry.
+        */
+       if (d_mountpoint(dentry) &&
+           (cd->rd_bmval[0] & ~FATTR4_WORD0_RDATTR_ERROR) == 0 &&
+           (cd->rd_bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) == 0)
+               ignore_crossmnt = 1;
+       else if (d_mountpoint(dentry)) {
                int err;
 
                /*
@@ -1889,7 +1911,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
 
        }
        nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
-                                       cd->rd_rqstp);
+                                       cd->rd_rqstp, ignore_crossmnt);
 out_put:
        dput(dentry);
        exp_put(exp);
@@ -2043,7 +2065,7 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
        buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2);
        nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
                                    resp->p, &buflen, getattr->ga_bmval,
-                                   resp->rqstp);
+                                   resp->rqstp, 0);
        if (!nfserr)
                resp->p += buflen;
        return nfserr;
index 578f2c9d56bec09899d4dc1c0ca135320e7c6bf9..5bfc2ac60d543a07a8d70d71057cf880d1bb995e 100644 (file)
@@ -44,17 +44,17 @@ static int  nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
  */
 static DEFINE_SPINLOCK(cache_lock);
 
-void
-nfsd_cache_init(void)
+int nfsd_reply_cache_init(void)
 {
        struct svc_cacherep     *rp;
        int                     i;
 
        INIT_LIST_HEAD(&lru_head);
        i = CACHESIZE;
-       while(i) {
+       while (i) {
                rp = kmalloc(sizeof(*rp), GFP_KERNEL);
-               if (!rp) break;
+               if (!rp)
+                       goto out_nomem;
                list_add(&rp->c_lru, &lru_head);
                rp->c_state = RC_UNUSED;
                rp->c_type = RC_NOCACHE;
@@ -62,23 +62,19 @@ nfsd_cache_init(void)
                i--;
        }
 
-       if (i)
-               printk (KERN_ERR "nfsd: cannot allocate all %d cache entries, only got %d\n",
-                       CACHESIZE, CACHESIZE-i);
-
        hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
-       if (!hash_list) {
-               nfsd_cache_shutdown();
-               printk (KERN_ERR "nfsd: cannot allocate %Zd bytes for hash list\n",
-                       HASHSIZE * sizeof(struct hlist_head));
-               return;
-       }
+       if (!hash_list)
+               goto out_nomem;
 
        cache_disabled = 0;
+       return 0;
+out_nomem:
+       printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
+       nfsd_reply_cache_shutdown();
+       return -ENOMEM;
 }
 
-void
-nfsd_cache_shutdown(void)
+void nfsd_reply_cache_shutdown(void)
 {
        struct svc_cacherep     *rp;
 
index 77dc9893b7bab462b65ebd751d9cc0c01273891f..8516137cdbb055ac87d1673c482126d2e670f488 100644 (file)
@@ -304,6 +304,9 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
        struct auth_domain *dom;
        struct knfsd_fh fh;
 
+       if (size == 0)
+               return -EINVAL;
+
        if (buf[size-1] != '\n')
                return -EINVAL;
        buf[size-1] = 0;
@@ -503,7 +506,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
                int len = 0;
                lock_kernel();
                if (nfsd_serv)
-                       len = svc_sock_names(buf, nfsd_serv, NULL);
+                       len = svc_xprt_names(nfsd_serv, buf, 0);
                unlock_kernel();
                return len;
        }
@@ -540,7 +543,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
                }
                return err < 0 ? err : 0;
        }
-       if (buf[0] == '-') {
+       if (buf[0] == '-' && isdigit(buf[1])) {
                char *toclose = kstrdup(buf+1, GFP_KERNEL);
                int len = 0;
                if (!toclose)
@@ -554,6 +557,53 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
                kfree(toclose);
                return len;
        }
+       /*
+        * Add a transport listener by writing its transport name
+        */
+       if (isalpha(buf[0])) {
+               int err;
+               char transport[16];
+               int port;
+               if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
+                       err = nfsd_create_serv();
+                       if (!err) {
+                               err = svc_create_xprt(nfsd_serv,
+                                                     transport, port,
+                                                     SVC_SOCK_ANONYMOUS);
+                               if (err == -ENOENT)
+                                       /* Give a reasonable perror msg for
+                                        * bad transport string */
+                                       err = -EPROTONOSUPPORT;
+                       }
+                       return err < 0 ? err : 0;
+               }
+       }
+       /*
+        * Remove a transport by writing its transport name and port number
+        */
+       if (buf[0] == '-' && isalpha(buf[1])) {
+               struct svc_xprt *xprt;
+               int err = -EINVAL;
+               char transport[16];
+               int port;
+               if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
+                       if (port == 0)
+                               return -EINVAL;
+                       lock_kernel();
+                       if (nfsd_serv) {
+                               xprt = svc_find_xprt(nfsd_serv, transport,
+                                                    AF_UNSPEC, port);
+                               if (xprt) {
+                                       svc_close_xprt(xprt);
+                                       svc_xprt_put(xprt);
+                                       err = 0;
+                               } else
+                                       err = -ENOTCONN;
+                       }
+                       unlock_kernel();
+                       return err < 0 ? err : 0;
+               }
+       }
        return -EINVAL;
 }
 
@@ -616,7 +666,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
        char *recdir;
        int len, status;
 
-       if (size > PATH_MAX || buf[size-1] != '\n')
+       if (size == 0 || size > PATH_MAX || buf[size-1] != '\n')
                return -EINVAL;
        buf[size-1] = 0;
 
@@ -674,6 +724,27 @@ static struct file_system_type nfsd_fs_type = {
        .kill_sb        = kill_litter_super,
 };
 
+#ifdef CONFIG_PROC_FS
+static int create_proc_exports_entry(void)
+{
+       struct proc_dir_entry *entry;
+
+       entry = proc_mkdir("fs/nfs", NULL);
+       if (!entry)
+               return -ENOMEM;
+       entry = create_proc_entry("fs/nfs/exports", 0, NULL);
+       if (!entry)
+               return -ENOMEM;
+       entry->proc_fops =  &exports_operations;
+       return 0;
+}
+#else /* CONFIG_PROC_FS */
+static int create_proc_exports_entry(void)
+{
+       return 0;
+}
+#endif
+
 static int __init init_nfsd(void)
 {
        int retval;
@@ -683,32 +754,43 @@ static int __init init_nfsd(void)
        if (retval)
                return retval;
        nfsd_stat_init();       /* Statistics */
-       nfsd_cache_init();      /* RPC reply cache */
-       nfsd_export_init();     /* Exports table */
+       retval = nfsd_reply_cache_init();
+       if (retval)
+               goto out_free_stat;
+       retval = nfsd_export_init();
+       if (retval)
+               goto out_free_cache;
        nfsd_lockd_init();      /* lockd->nfsd callbacks */
-       nfsd_idmap_init();      /* Name to ID mapping */
-       if (proc_mkdir("fs/nfs", NULL)) {
-               struct proc_dir_entry *entry;
-               entry = create_proc_entry("fs/nfs/exports", 0, NULL);
-               if (entry)
-                       entry->proc_fops =  &exports_operations;
-       }
+       retval = nfsd_idmap_init();
+       if (retval)
+               goto out_free_lockd;
+       retval = create_proc_exports_entry();
+       if (retval)
+               goto out_free_idmap;
        retval = register_filesystem(&nfsd_fs_type);
-       if (retval) {
-               nfsd_export_shutdown();
-               nfsd_cache_shutdown();
-               remove_proc_entry("fs/nfs/exports", NULL);
-               remove_proc_entry("fs/nfs", NULL);
-               nfsd_stat_shutdown();
-               nfsd_lockd_shutdown();
-       }
+       if (retval)
+               goto out_free_all;
+       return 0;
+out_free_all:
+       remove_proc_entry("fs/nfs/exports", NULL);
+       remove_proc_entry("fs/nfs", NULL);
+out_free_idmap:
+       nfsd_idmap_shutdown();
+out_free_lockd:
+       nfsd_lockd_shutdown();
+       nfsd_export_shutdown();
+out_free_cache:
+       nfsd_reply_cache_shutdown();
+out_free_stat:
+       nfsd_stat_shutdown();
+       nfsd4_free_slabs();
        return retval;
 }
 
 static void __exit exit_nfsd(void)
 {
        nfsd_export_shutdown();
-       nfsd_cache_shutdown();
+       nfsd_reply_cache_shutdown();
        remove_proc_entry("fs/nfs/exports", NULL);
        remove_proc_entry("fs/nfs", NULL);
        nfsd_stat_shutdown();
index 468f17a784416e96e11081d90230a8e6df4eec90..8fbd2dc08a92159a853ef748db561522ec0a0480 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/nfsd/nfsd.h>
+#include "auth.h"
 
 #define NFSDDBG_FACILITY               NFSDDBG_FH
 
index 1190aeaa92be2e4563c759d04fe96bbc46a0b393..9647b0f7bc0c0d52610bf1419c9bafcd466b57dc 100644 (file)
@@ -155,8 +155,8 @@ static int killsig; /* signal that was used to kill last nfsd */
 static void nfsd_last_thread(struct svc_serv *serv)
 {
        /* When last nfsd thread exits we need to do some clean-up */
-       struct svc_sock *svsk;
-       list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
+       struct svc_xprt *xprt;
+       list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
                lockd_down();
        nfsd_serv = NULL;
        nfsd_racache_shutdown();
@@ -236,7 +236,7 @@ static int nfsd_init_socks(int port)
 
        error = lockd_up(IPPROTO_UDP);
        if (error >= 0) {
-               error = svc_makesock(nfsd_serv, IPPROTO_UDP, port,
+               error = svc_create_xprt(nfsd_serv, "udp", port,
                                        SVC_SOCK_DEFAULTS);
                if (error < 0)
                        lockd_down();
@@ -247,7 +247,7 @@ static int nfsd_init_socks(int port)
 #ifdef CONFIG_NFSD_TCP
        error = lockd_up(IPPROTO_TCP);
        if (error >= 0) {
-               error = svc_makesock(nfsd_serv, IPPROTO_TCP, port,
+               error = svc_create_xprt(nfsd_serv, "tcp", port,
                                        SVC_SOCK_DEFAULTS);
                if (error < 0)
                        lockd_down();
index b86e3658a0af10ebce825e260ee1fc5fd879f367..61ad61743d9403524242e564523b4f3945aa25e3 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/xdr.h>
 #include <linux/mm.h>
+#include "auth.h"
 
 #define NFSDDBG_FACILITY               NFSDDBG_XDR
 
@@ -62,10 +63,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
  * no slashes or null bytes.
  */
 static __be32 *
-decode_filename(__be32 *p, char **namp, int *lenp)
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
 {
        char            *name;
-       int             i;
+       unsigned int    i;
 
        if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) {
                for (i = 0, name = *namp; i < *lenp; i++, name++) {
@@ -78,10 +79,10 @@ decode_filename(__be32 *p, char **namp, int *lenp)
 }
 
 static __be32 *
-decode_pathname(__be32 *p, char **namp, int *lenp)
+decode_pathname(__be32 *p, char **namp, unsigned int *lenp)
 {
        char            *name;
-       int             i;
+       unsigned int    i;
 
        if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) {
                for (i = 0, name = *namp; i < *lenp; i++, name++) {
index d0199189924cee7d5ea49aafd52dcc81fab21df2..cc75e4fcd02baf2a989573ac5aac2e16dad8df17 100644 (file)
@@ -132,7 +132,7 @@ out:
 
 __be32
 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
-                  const char *name, int len,
+                  const char *name, unsigned int len,
                   struct svc_export **exp_ret, struct dentry **dentry_ret)
 {
        struct svc_export       *exp;
@@ -226,7 +226,7 @@ out_nfserr:
  */
 __be32
 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
-                                       int len, struct svc_fh *resfh)
+                               unsigned int len, struct svc_fh *resfh)
 {
        struct svc_export       *exp;
        struct dentry           *dentry;
@@ -1151,6 +1151,26 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 }
 #endif /* CONFIG_NFSD_V3 */
 
+__be32
+nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
+                       struct iattr *iap)
+{
+       /*
+        * Mode has already been set earlier in create:
+        */
+       iap->ia_valid &= ~ATTR_MODE;
+       /*
+        * Setting uid/gid works only for root.  Irix appears to
+        * send along the gid on create when it tries to implement
+        * setgid directories via NFS:
+        */
+       if (current->fsuid != 0)
+               iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
+       if (iap->ia_valid)
+               return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+       return 0;
+}
+
 /*
  * Create a file (regular, directory, device, fifo); UNIX sockets 
  * not yet implemented.
@@ -1167,6 +1187,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        struct dentry   *dentry, *dchild = NULL;
        struct inode    *dirp;
        __be32          err;
+       __be32          err2;
        int             host_err;
 
        err = nfserr_perm;
@@ -1257,16 +1278,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        }
 
 
-       /* Set file attributes. Mode has already been set and
-        * setting uid/gid works only for root. Irix appears to
-        * send along the gid when it tries to implement setgid
-        * directories via NFS.
-        */
-       if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
-               __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
-               if (err2)
-                       err = err2;
-       }
+       err2 = nfsd_create_setattr(rqstp, resfhp, iap);
+       if (err2)
+               err = err2;
        /*
         * Update the file handle to get the new inode info.
         */
@@ -1295,6 +1309,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        struct dentry   *dentry, *dchild = NULL;
        struct inode    *dirp;
        __be32          err;
+       __be32          err2;
        int             host_err;
        __u32           v_mtime=0, v_atime=0;
 
@@ -1399,16 +1414,10 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
                iap->ia_atime.tv_nsec = 0;
        }
 
-       /* Set file attributes.
-        * Irix appears to send along the gid when it tries to
-        * implement setgid directories via NFS. Clear out all that cruft.
-        */
  set_attr:
-       if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
-               __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
-               if (err2)
-                       err = err2;
-       }
+       err2 = nfsd_create_setattr(rqstp, resfhp, iap);
+       if (err2)
+               err = err2;
 
        /*
         * Update the filehandle to get the new inode info.
index e2d1ce36b36728084d0d77538dd00b7101d1a528..4babb2a129ac946decc0478dc6690a6818e3b0e4 100644 (file)
@@ -173,14 +173,17 @@ void                nlmclnt_next_cookie(struct nlm_cookie *);
 /*
  * Host cache
  */
-struct nlm_host * nlmclnt_lookup_host(const struct sockaddr_in *, int, int, const char *, int);
-struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *, const char *, int);
+struct nlm_host  *nlmclnt_lookup_host(const struct sockaddr_in *, int, int,
+                                       const char *, unsigned int);
+struct nlm_host  *nlmsvc_lookup_host(struct svc_rqst *, const char *,
+                                       unsigned int);
 struct rpc_clnt * nlm_bind_host(struct nlm_host *);
 void             nlm_rebind_host(struct nlm_host *);
 struct nlm_host * nlm_get_host(struct nlm_host *);
 void             nlm_release_host(struct nlm_host *);
 void             nlm_shutdown_hosts(void);
-extern void      nlm_host_rebooted(const struct sockaddr_in *, const char *, int, u32);
+extern void      nlm_host_rebooted(const struct sockaddr_in *, const char *,
+                                       unsigned int, u32);
 void             nsm_release(struct nsm_handle *);
 
 
index 83a1f9f6237b5f539f52cda2b6b09eeb013dd97c..df18fa053bcd7d0a1e1f9665e57efa8929dafb87 100644 (file)
@@ -29,7 +29,7 @@ struct svc_rqst;
 /* Lock info passed via NLM */
 struct nlm_lock {
        char *                  caller;
-       int                     len;    /* length of "caller" */
+       unsigned int            len;    /* length of "caller" */
        struct nfs_fh           fh;
        struct xdr_netobj       oh;
        u32                     svid;
@@ -78,7 +78,7 @@ struct nlm_res {
  */
 struct nlm_reboot {
        char *          mon;
-       int             len;
+       unsigned int    len;
        u32             state;
        __be32          addr;
        __be32          vers;
index d9c5455808e590cd72057b18d22d239dc47e4b0a..e726fc3a4375f5948d395e6756e10bca0673d9b6 100644 (file)
@@ -4,4 +4,3 @@ unifdef-y += stats.h
 unifdef-y += syscall.h
 unifdef-y += nfsfh.h
 unifdef-y += debug.h
-unifdef-y += auth.h
index 007480cd6a601fbec62c5d5ac41ed50e2a2d882d..7b5d784cc8587cd80c8bd31125f56a793ebae253 100644 (file)
@@ -72,8 +72,8 @@ enum {
  */
 #define RC_DELAY               (HZ/5)
 
-void   nfsd_cache_init(void);
-void   nfsd_cache_shutdown(void);
+int    nfsd_reply_cache_init(void);
+void   nfsd_reply_cache_shutdown(void);
 int    nfsd_cache_lookup(struct svc_rqst *, int);
 void   nfsd_cache_update(struct svc_rqst *, int, __be32 *);
 
index bcb7abafbca9d041478122ecfeb26d8236a1f46d..3a1687251367bd83b18711303f1ae3a712dbdff0 100644 (file)
@@ -122,7 +122,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp);
 /*
  * Function declarations
  */
-void                   nfsd_export_init(void);
+int                    nfsd_export_init(void);
 void                   nfsd_export_shutdown(void);
 void                   nfsd_export_flush(void);
 void                   exp_readlock(void);
index 604a0d786bc6a2699f3968eed506c6ec001ebc0a..8caf4c4f64e68df30c686a396d60252e63915319 100644 (file)
@@ -20,7 +20,6 @@
 #include <linux/nfsd/debug.h>
 #include <linux/nfsd/nfsfh.h>
 #include <linux/nfsd/export.h>
-#include <linux/nfsd/auth.h>
 #include <linux/nfsd/stats.h>
 /*
  * nfsd version
@@ -70,9 +69,9 @@ void          nfsd_racache_shutdown(void);
 int            nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
                                struct svc_export **expp);
 __be32         nfsd_lookup(struct svc_rqst *, struct svc_fh *,
-                               const char *, int, struct svc_fh *);
+                               const char *, unsigned int, struct svc_fh *);
 __be32          nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
-                               const char *, int,
+                               const char *, unsigned int,
                                struct svc_export **, struct dentry **);
 __be32         nfsd_setattr(struct svc_rqst *, struct svc_fh *,
                                struct iattr *, int, time_t);
index 8bcddccb6c427e81a7a7b960d1f25afaa1bb9b6f..4e439765b705e3a01dd3be9559b57d38a70ea445 100644 (file)
@@ -18,7 +18,6 @@
 #include <linux/nfsd/const.h>
 #include <linux/nfsd/export.h>
 #include <linux/nfsd/nfsfh.h>
-#include <linux/nfsd/auth.h>
 
 /*
  * Version of the syscall interface
index 67885d5e6e50d2024b5d036f31a1fd5c3e5ab6ee..a0132ef58f2128c9fe25f1412f9959355db92d58 100644 (file)
@@ -23,7 +23,7 @@ struct nfsd_sattrargs {
 struct nfsd_diropargs {
        struct svc_fh           fh;
        char *                  name;
-       int                     len;
+       unsigned int            len;
 };
 
 struct nfsd_readargs {
@@ -43,17 +43,17 @@ struct nfsd_writeargs {
 struct nfsd_createargs {
        struct svc_fh           fh;
        char *                  name;
-       int                     len;
+       unsigned int            len;
        struct iattr            attrs;
 };
 
 struct nfsd_renameargs {
        struct svc_fh           ffh;
        char *                  fname;
-       int                     flen;
+       unsigned int            flen;
        struct svc_fh           tfh;
        char *                  tname;
-       int                     tlen;
+       unsigned int            tlen;
 };
 
 struct nfsd_readlinkargs {
@@ -65,15 +65,15 @@ struct nfsd_linkargs {
        struct svc_fh           ffh;
        struct svc_fh           tfh;
        char *                  tname;
-       int                     tlen;
+       unsigned int            tlen;
 };
 
 struct nfsd_symlinkargs {
        struct svc_fh           ffh;
        char *                  fname;
-       int                     flen;
+       unsigned int            flen;
        char *                  tname;
-       int                     tlen;
+       unsigned int            tlen;
        struct iattr            attrs;
 };
 
index 89d9d6061a62b5b8179380d4f38d346700490da8..421eddd65a25a7968b8abe94d4f041e4f9aa0319 100644 (file)
@@ -21,7 +21,7 @@ struct nfsd3_sattrargs {
 struct nfsd3_diropargs {
        struct svc_fh           fh;
        char *                  name;
-       int                     len;
+       unsigned int            len;
 };
 
 struct nfsd3_accessargs {
@@ -48,7 +48,7 @@ struct nfsd3_writeargs {
 struct nfsd3_createargs {
        struct svc_fh           fh;
        char *                  name;
-       int                     len;
+       unsigned int            len;
        int                     createmode;
        struct iattr            attrs;
        __be32 *                verf;
@@ -57,7 +57,7 @@ struct nfsd3_createargs {
 struct nfsd3_mknodargs {
        struct svc_fh           fh;
        char *                  name;
-       int                     len;
+       unsigned int            len;
        __u32                   ftype;
        __u32                   major, minor;
        struct iattr            attrs;
@@ -66,10 +66,10 @@ struct nfsd3_mknodargs {
 struct nfsd3_renameargs {
        struct svc_fh           ffh;
        char *                  fname;
-       int                     flen;
+       unsigned int            flen;
        struct svc_fh           tfh;
        char *                  tname;
-       int                     tlen;
+       unsigned int            tlen;
 };
 
 struct nfsd3_readlinkargs {
@@ -81,15 +81,15 @@ struct nfsd3_linkargs {
        struct svc_fh           ffh;
        struct svc_fh           tfh;
        char *                  tname;
-       int                     tlen;
+       unsigned int            tlen;
 };
 
 struct nfsd3_symlinkargs {
        struct svc_fh           ffh;
        char *                  fname;
-       int                     flen;
+       unsigned int            flen;
        char *                  tname;
-       int                     tlen;
+       unsigned int            tlen;
        struct iattr            attrs;
 };
 
index b0ddfb41c790754034a9cb5cfe650c6c00602339..27bd3e38ec5ad86b290a0f73feb73e973b917749 100644 (file)
@@ -441,7 +441,7 @@ void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
 void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
 __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
                       struct dentry *dentry, __be32 *buffer, int *countp,
-                      u32 *bmval, struct svc_rqst *);
+                      u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
 extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
                struct nfsd4_compound_state *,
                struct nfsd4_setclientid *setclid);
index e82746fcad14551ab7aaf46157e3aba04c27e6b1..d4a2ac18bd4cfba2a6713aa6a492660264b94782 100644 (file)
 #define IDMAP_NAMESZ 128
 
 #ifdef CONFIG_NFSD_V4
-void nfsd_idmap_init(void);
+int nfsd_idmap_init(void);
 void nfsd_idmap_shutdown(void);
 #else
-static inline void nfsd_idmap_init(void) {};
-static inline void nfsd_idmap_shutdown(void) {};
+static inline int nfsd_idmap_init(void)
+{
+       return 0;
+}
+static inline void nfsd_idmap_shutdown(void)
+{
+}
 #endif
 
 int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
index bd7a6b0a87af654e080093037915238b6fc4fd74..03547d6abee50be5c73373dd14b6811700655b59 100644 (file)
@@ -169,8 +169,8 @@ extern int cache_check(struct cache_detail *detail,
 extern void cache_flush(void);
 extern void cache_purge(struct cache_detail *detail);
 #define NEVER (0x7FFFFFFF)
-extern void cache_register(struct cache_detail *cd);
-extern int cache_unregister(struct cache_detail *cd);
+extern int cache_register(struct cache_detail *cd);
+extern void cache_unregister(struct cache_detail *cd);
 
 extern void qword_add(char **bpp, int *lp, char *str);
 extern void qword_addhex(char **bpp, int *lp, char *buf, int blen);
index 3912cf16361ee98aa9334399fe6d8289e3270364..10709cbe96fdb88419a87b15bf32fc81fef7c06a 100644 (file)
@@ -20,7 +20,7 @@
 #define RPCDBG_BIND            0x0020
 #define RPCDBG_SCHED           0x0040
 #define RPCDBG_TRANS           0x0080
-#define RPCDBG_SVCSOCK         0x0100
+#define RPCDBG_SVCXPRT         0x0100
 #define RPCDBG_SVCDSP          0x0200
 #define RPCDBG_MISC            0x0400
 #define RPCDBG_CACHE           0x0800
index 8531a70da73d2526d7e110c92baa6fddb9152212..64c771056187a3957c70ce3c212e0db82db49ff9 100644 (file)
@@ -204,7 +204,7 @@ union svc_addr_u {
 struct svc_rqst {
        struct list_head        rq_list;        /* idle list */
        struct list_head        rq_all;         /* all threads list */
-       struct svc_sock *       rq_sock;        /* socket */
+       struct svc_xprt *       rq_xprt;        /* transport ptr */
        struct sockaddr_storage rq_addr;        /* peer address */
        size_t                  rq_addrlen;
 
@@ -214,9 +214,10 @@ struct svc_rqst {
        struct auth_ops *       rq_authop;      /* authentication flavour */
        u32                     rq_flavor;      /* pseudoflavor */
        struct svc_cred         rq_cred;        /* auth info */
-       struct sk_buff *        rq_skbuff;      /* fast recv inet buffer */
+       void *                  rq_xprt_ctxt;   /* transport specific context ptr */
        struct svc_deferred_req*rq_deferred;    /* deferred request we are replaying */
 
+       size_t                  rq_xprt_hlen;   /* xprt header len */
        struct xdr_buf          rq_arg;
        struct xdr_buf          rq_res;
        struct page *           rq_pages[RPCSVC_MAXPAGES];
@@ -317,11 +318,12 @@ static inline void svc_free_res_pages(struct svc_rqst *rqstp)
 
 struct svc_deferred_req {
        u32                     prot;   /* protocol (UDP or TCP) */
-       struct svc_sock         *svsk;
+       struct svc_xprt         *xprt;
        struct sockaddr_storage addr;   /* where reply must go */
        size_t                  addrlen;
        union svc_addr_u        daddr;  /* where reply must come from */
        struct cache_deferred_req handle;
+       size_t                  xprt_hlen;
        int                     argslen;
        __be32                  args[0];
 };
@@ -382,6 +384,8 @@ struct svc_procedure {
  */
 struct svc_serv *  svc_create(struct svc_program *, unsigned int,
                              void (*shutdown)(struct svc_serv*));
+struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
+                                       struct svc_pool *pool);
 int               svc_create_thread(svc_thread_fn, struct svc_serv *);
 void              svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
new file mode 100644 (file)
index 0000000..c11bbcc
--- /dev/null
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#ifndef SVC_RDMA_H
+#define SVC_RDMA_H
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#define SVCRDMA_DEBUG
+
+/* RPC/RDMA parameters and stats */
+extern unsigned int svcrdma_ord;
+extern unsigned int svcrdma_max_requests;
+extern unsigned int svcrdma_max_req_size;
+
+extern atomic_t rdma_stat_recv;
+extern atomic_t rdma_stat_read;
+extern atomic_t rdma_stat_write;
+extern atomic_t rdma_stat_sq_starve;
+extern atomic_t rdma_stat_rq_starve;
+extern atomic_t rdma_stat_rq_poll;
+extern atomic_t rdma_stat_rq_prod;
+extern atomic_t rdma_stat_sq_poll;
+extern atomic_t rdma_stat_sq_prod;
+
+#define RPCRDMA_VERSION 1
+
+/*
+ * Contexts are built when an RDMA request is created and are a
+ * record of the resources that can be recovered when the request
+ * completes.
+ */
+struct svc_rdma_op_ctxt {
+       struct svc_rdma_op_ctxt *next;  /* NOTE(review): presumably links
+                                        * contexts off sc_ctxt_head in
+                                        * struct svcxprt_rdma -- confirm */
+       struct xdr_buf arg;
+       struct list_head dto_q;
+       enum ib_wr_opcode wr_op;
+       enum ib_wc_status wc_status;
+       u32 byte_len;
+       struct svcxprt_rdma *xprt;
+       unsigned long flags;            /* NOTE(review): presumably the
+                                        * RDMACTXT_F_* values -- confirm */
+       enum dma_data_direction direction;
+       int count;
+       struct ib_sge sge[RPCSVC_MAXPAGES];
+       struct page *pages[RPCSVC_MAXPAGES];
+};
+
+#define RDMACTXT_F_READ_DONE   1
+#define RDMACTXT_F_LAST_CTXT   2
+
+/*
+ * Per-transport state for the RDMA server transport.  The generic
+ * struct svc_xprt is embedded as the first member (sc_xprt).
+ */
+struct svcxprt_rdma {
+       struct svc_xprt      sc_xprt;           /* SVC transport structure */
+       struct rdma_cm_id    *sc_cm_id;         /* RDMA connection id */
+       struct list_head     sc_accept_q;       /* Conn. waiting accept */
+       int                  sc_ord;            /* RDMA read limit */
+       wait_queue_head_t    sc_read_wait;
+       int                  sc_max_sge;
+
+       int                  sc_sq_depth;       /* Depth of SQ */
+       atomic_t             sc_sq_count;       /* Number of SQ WR on queue */
+
+       int                  sc_max_requests;   /* Depth of RQ */
+       int                  sc_max_req_size;   /* Size of each RQ WR buf */
+
+       struct ib_pd         *sc_pd;
+
+       struct svc_rdma_op_ctxt  *sc_ctxt_head;
+       int                  sc_ctxt_cnt;
+       int                  sc_ctxt_bump;
+       int                  sc_ctxt_max;
+       spinlock_t           sc_ctxt_lock;
+       struct list_head     sc_rq_dto_q;
+       spinlock_t           sc_rq_dto_lock;
+       struct ib_qp         *sc_qp;
+       struct ib_cq         *sc_rq_cq;
+       struct ib_cq         *sc_sq_cq;
+       struct ib_mr         *sc_phys_mr;       /* MR for server memory */
+
+       spinlock_t           sc_lock;           /* transport lock */
+
+       wait_queue_head_t    sc_send_wait;      /* SQ exhaustion waitlist */
+       unsigned long        sc_flags;          /* RDMAXPRT_* values */
+       struct list_head     sc_dto_q;          /* DTO tasklet I/O pending Q */
+       struct list_head     sc_read_complete_q;
+       spinlock_t           sc_read_complete_lock;
+};
+/* sc_flags */
+#define RDMAXPRT_RQ_PENDING    1
+#define RDMAXPRT_SQ_PENDING    2
+#define RDMAXPRT_CONN_PENDING  3
+
+#define RPCRDMA_LISTEN_BACKLOG  10
+/* The default ORD value is based on two outstanding full-size writes with a
+ * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ.  */
+#define RPCRDMA_ORD             (64/4)
+#define RPCRDMA_SQ_DEPTH_MULT   8
+#define RPCRDMA_MAX_THREADS     16
+#define RPCRDMA_MAX_REQUESTS    16
+#define RPCRDMA_MAX_REQ_SIZE    4096
+
+/* svc_rdma_marshal.c */
+extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *,
+                                     int *, int *);
+extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
+extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *);
+extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
+                                    struct rpcrdma_msg *,
+                                    enum rpcrdma_errcode, u32 *);
+extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
+extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
+extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
+                                           u32, u64, u32);
+extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *,
+                                            struct rpcrdma_msg *,
+                                            struct rpcrdma_msg *,
+                                            enum rpcrdma_proc);
+extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
+
+/* svc_rdma_recvfrom.c */
+extern int svc_rdma_recvfrom(struct svc_rqst *);
+
+/* svc_rdma_sendto.c */
+extern int svc_rdma_sendto(struct svc_rqst *);
+
+/* svc_rdma_transport.c */
+extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *);
+extern int svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
+                              enum rpcrdma_errcode);
+struct page *svc_rdma_get_page(void);
+extern int svc_rdma_post_recv(struct svcxprt_rdma *);
+extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
+extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
+extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
+extern void svc_sq_reap(struct svcxprt_rdma *);
+extern void svc_rq_reap(struct svcxprt_rdma *);
+extern struct svc_xprt_class svc_rdma_class;
+extern void svc_rdma_prep_reply_hdr(struct svc_rqst *);
+
+/* svc_rdma.c */
+extern int svc_rdma_init(void);
+extern void svc_rdma_cleanup(void);
+
+/*
+ * Returns the address of the first read chunk, or NULL if no read chunk
+ * is present.  The read list is taken to start at rm_body.rm_chunks[0];
+ * a zero rc_discrim in that first entry is treated as "no read chunk".
+ */
+static inline struct rpcrdma_read_chunk *
+svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+{
+       struct rpcrdma_read_chunk *ch =
+               (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+
+       if (ch->rc_discrim == 0)
+               return NULL;
+
+       return ch;
+}
+
+/*
+ * Returns the address of the write array, or NULL if no write array is
+ * present.  A write array is reported only when rm_chunks[0] is zero
+ * and rm_chunks[1] is non-zero; it is then located at rm_chunks[1].
+ */
+static inline struct rpcrdma_write_array *
+svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
+{
+       if (rmsgp->rm_body.rm_chunks[0] != 0
+           || rmsgp->rm_body.rm_chunks[1] == 0)
+               return NULL;
+
+       return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
+}
+
+/*
+ * Returns the address of the reply array, or NULL if no reply array is
+ * present.  NULL is also returned whenever rm_chunks[0] or rm_chunks[1]
+ * is non-zero (see the XXX note in the body).
+ */
+static inline struct rpcrdma_write_array *
+svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp)
+{
+       struct rpcrdma_read_chunk *rch;
+       struct rpcrdma_write_array *wr_ary;
+       struct rpcrdma_write_array *rp_ary;
+
+       /* XXX: Need to fix when reply list may occur with read-list and/or
+        * write list */
+       if (rmsgp->rm_body.rm_chunks[0] != 0 ||
+           rmsgp->rm_body.rm_chunks[1] != 0)
+               return NULL;
+
+       /* NOTE(review): past this point rm_chunks[0] and rm_chunks[1] are
+        * both zero, so svc_rdma_get_write_array() below always returns
+        * NULL, and svc_rdma_get_read_chunk() presumably does too (if
+        * rc_discrim overlays rm_chunks[0] -- confirm).  Until the XXX
+        * above is addressed only the "no read list, no write list" case
+        * appears reachable.
+        */
+       rch = svc_rdma_get_read_chunk(rmsgp);
+       if (rch) {
+               /* Skip to the entry whose discriminator is zero. */
+               while (rch->rc_discrim)
+                       rch++;
+
+               /* The reply list follows an empty write array located
+                * at 'rc_position' here. The reply array is at rc_target.
+                */
+               rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
+
+               goto found_it;
+       }
+
+       wr_ary = svc_rdma_get_write_array(rmsgp);
+       if (wr_ary) {
+               /* The reply array is taken to start just past the last
+                * of the wc_nchunks write chunks. */
+               rp_ary = (struct rpcrdma_write_array *)
+                       &wr_ary->
+                       wc_array[wr_ary->wc_nchunks].wc_target.rs_length;
+
+               goto found_it;
+       }
+
+       /* No read list, no write list */
+       rp_ary = (struct rpcrdma_write_array *)
+               &rmsgp->rm_body.rm_chunks[2];
+
+ found_it:
+       /* A zero discriminator means no reply array was supplied. */
+       if (rp_ary->wc_discrim == 0)
+               return NULL;
+
+       return rp_ary;
+}
+#endif
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
new file mode 100644 (file)
index 0000000..6fd7b01
--- /dev/null
@@ -0,0 +1,159 @@
+/*
+ * linux/include/linux/sunrpc/svc_xprt.h
+ *
+ * RPC server transport I/O
+ */
+
+#ifndef SUNRPC_SVC_XPRT_H
+#define SUNRPC_SVC_XPRT_H
+
+#include <linux/sunrpc/svc.h>
+#include <linux/module.h>
+
+/*
+ * Table of operations supplied by a transport implementation.  It is
+ * referenced from struct svc_xprt_class (xcl_ops) and from each
+ * struct svc_xprt (xpt_ops).
+ */
+struct svc_xprt_ops {
+       struct svc_xprt *(*xpo_create)(struct svc_serv *,
+                                      struct sockaddr *, int,
+                                      int);
+       struct svc_xprt *(*xpo_accept)(struct svc_xprt *);
+       int             (*xpo_has_wspace)(struct svc_xprt *);
+       int             (*xpo_recvfrom)(struct svc_rqst *);
+       void            (*xpo_prep_reply_hdr)(struct svc_rqst *);
+       int             (*xpo_sendto)(struct svc_rqst *);
+       void            (*xpo_release_rqst)(struct svc_rqst *);
+       void            (*xpo_detach)(struct svc_xprt *);
+       void            (*xpo_free)(struct svc_xprt *);
+};
+
+/*
+ * Describes one kind of transport: its name, owning module, operations
+ * table and maximum payload.  NOTE(review): xcl_list presumably links
+ * the class into the set handled by svc_reg_xprt_class() and
+ * svc_unreg_xprt_class() -- confirm.
+ */
+struct svc_xprt_class {
+       const char              *xcl_name;
+       struct module           *xcl_owner;
+       struct svc_xprt_ops     *xcl_ops;
+       struct list_head        xcl_list;
+       u32                     xcl_max_payload;
+};
+
+/*
+ * Transport-independent state of an RPC server transport.  Embedded in
+ * transport-specific structures (struct svc_sock, struct svcxprt_rdma).
+ */
+struct svc_xprt {
+       struct svc_xprt_class   *xpt_class;
+       struct svc_xprt_ops     *xpt_ops;
+       struct kref             xpt_ref;
+       struct list_head        xpt_list;
+       struct list_head        xpt_ready;
+       unsigned long           xpt_flags;
+#define        XPT_BUSY        0               /* enqueued/receiving */
+#define        XPT_CONN        1               /* conn pending */
+#define        XPT_CLOSE       2               /* dead or dying */
+#define        XPT_DATA        3               /* data pending */
+#define        XPT_TEMP        4               /* connected transport */
+#define        XPT_DEAD        6               /* transport closed */
+#define        XPT_CHNGBUF     7               /* need to change snd/rcv buf sizes */
+#define        XPT_DEFERRED    8               /* deferred request pending */
+#define        XPT_OLD         9               /* used for xprt aging mark+sweep */
+#define        XPT_DETACHED    10              /* detached from tempsocks list */
+#define XPT_LISTENER   11              /* listening endpoint */
+#define XPT_CACHE_AUTH 12              /* cache auth info */
+
+       struct svc_pool         *xpt_pool;      /* current pool iff queued */
+       struct svc_serv         *xpt_server;    /* service for transport */
+       atomic_t                xpt_reserved;   /* space on outq that is rsvd */
+       struct mutex            xpt_mutex;      /* to serialize sending data */
+       spinlock_t              xpt_lock;       /* protects xpt_deferred
+                                                * and xpt_auth_cache */
+       void                    *xpt_auth_cache;/* auth cache */
+       struct list_head        xpt_deferred;   /* deferred requests that need
+                                                * to be revisited */
+       struct sockaddr_storage xpt_local;      /* local address */
+       size_t                  xpt_locallen;   /* length of address */
+       struct sockaddr_storage xpt_remote;     /* remote peer's address */
+       size_t                  xpt_remotelen;  /* length of address */
+};
+
+int    svc_reg_xprt_class(struct svc_xprt_class *);
+void   svc_unreg_xprt_class(struct svc_xprt_class *);
+void   svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *,
+                     struct svc_serv *);
+int    svc_create_xprt(struct svc_serv *, char *, unsigned short, int);
+void   svc_xprt_enqueue(struct svc_xprt *xprt);
+void   svc_xprt_received(struct svc_xprt *);
+void   svc_xprt_put(struct svc_xprt *xprt);
+void   svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt);
+void   svc_close_xprt(struct svc_xprt *xprt);
+void   svc_delete_xprt(struct svc_xprt *xprt);
+int    svc_port_is_privileged(struct sockaddr *sin);
+int    svc_print_xprts(char *buf, int maxlen);
+struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int);
+int    svc_xprt_names(struct svc_serv *serv, char *buf, int buflen);
+
+/* Take an additional reference on the transport (kref_get on xpt_ref). */
+static inline void svc_xprt_get(struct svc_xprt *xprt)
+{
+       kref_get(&xprt->xpt_ref);
+}
+/*
+ * Record the transport's local address: copies salen bytes from sa into
+ * xpt_local and stores salen in xpt_locallen.
+ * NOTE(review): salen is not checked against sizeof(xprt->xpt_local);
+ * the caller has to guarantee that the address fits.
+ */
+static inline void svc_xprt_set_local(struct svc_xprt *xprt,
+                                     struct sockaddr *sa, int salen)
+{
+       memcpy(&xprt->xpt_local, sa, salen);
+       xprt->xpt_locallen = salen;
+}
+/*
+ * Record the peer's address: copies salen bytes from sa into xpt_remote
+ * and stores salen in xpt_remotelen.
+ * NOTE(review): salen is not checked against sizeof(xprt->xpt_remote);
+ * the caller has to guarantee that the address fits.
+ */
+static inline void svc_xprt_set_remote(struct svc_xprt *xprt,
+                                      struct sockaddr *sa, int salen)
+{
+       memcpy(&xprt->xpt_remote, sa, salen);
+       xprt->xpt_remotelen = salen;
+}
+/*
+ * Return the port number stored in sa, converted to host byte order.
+ * Only AF_INET and AF_INET6 are handled; any other address family
+ * yields 0.
+ */
+static inline unsigned short svc_addr_port(struct sockaddr *sa)
+{
+       unsigned short ret = 0;
+       switch (sa->sa_family) {
+       case AF_INET:
+               ret = ntohs(((struct sockaddr_in *)sa)->sin_port);
+               break;
+       case AF_INET6:
+               ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port);
+               break;
+       }
+       return ret;
+}
+
+/*
+ * Return the size of the sockaddr structure that matches sa's address
+ * family (AF_INET or AF_INET6).
+ *
+ * NOTE(review): for any other family -EAFNOSUPPORT is returned through
+ * an unsigned size_t, so the caller receives a very large positive
+ * value.  A "< 0" test on a size_t result can never detect this error;
+ * callers must compare against the converted value or avoid passing
+ * unsupported families.
+ */
+static inline size_t svc_addr_len(struct sockaddr *sa)
+{
+       switch (sa->sa_family) {
+       case AF_INET:
+               return sizeof(struct sockaddr_in);
+       case AF_INET6:
+               return sizeof(struct sockaddr_in6);
+       }
+       return -EAFNOSUPPORT;
+}
+
+/* Port of the transport's local address, as returned by svc_addr_port(). */
+static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt)
+{
+       return svc_addr_port((struct sockaddr *)&xprt->xpt_local);
+}
+
+/* Port of the peer's address, as returned by svc_addr_port(). */
+static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt)
+{
+       return svc_addr_port((struct sockaddr *)&xprt->xpt_remote);
+}
+
+/*
+ * Format addr as text into buf, writing at most len bytes via snprintf(),
+ * and return buf.
+ *
+ * AF_INET addresses are printed as a dotted quad followed by the port,
+ * AF_INET6 addresses as eight colon-separated hex groups followed by the
+ * port.  Any other family produces an "unknown address type" message
+ * carrying the numeric family value.
+ */
+static inline char *__svc_print_addr(struct sockaddr *addr,
+                                    char *buf, size_t len)
+{
+       switch (addr->sa_family) {
+       case AF_INET:
+               snprintf(buf, len, "%u.%u.%u.%u, port=%u",
+                       NIPQUAD(((struct sockaddr_in *) addr)->sin_addr),
+                       ntohs(((struct sockaddr_in *) addr)->sin_port));
+               break;
+
+       case AF_INET6:
+               snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u",
+                       NIP6(((struct sockaddr_in6 *) addr)->sin6_addr),
+                       ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
+               break;
+
+       default:
+               snprintf(buf, len, "unknown address type: %d", addr->sa_family);
+               break;
+       }
+       return buf;
+}
+#endif /* SUNRPC_SVC_XPRT_H */
index a53e0fa855d2e26f9b2827f616a7f0f9964b00eb..206f092ad4c7dbd09a120352f538141ad9e58977 100644 (file)
 #define SUNRPC_SVCSOCK_H
 
 #include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
 
 /*
  * RPC server socket.
  */
 struct svc_sock {
-       struct list_head        sk_ready;       /* list of ready sockets */
-       struct list_head        sk_list;        /* list of all sockets */
+       struct svc_xprt         sk_xprt;
        struct socket *         sk_sock;        /* berkeley socket layer */
        struct sock *           sk_sk;          /* INET layer */
 
-       struct svc_pool *       sk_pool;        /* current pool iff queued */
-       struct svc_serv *       sk_server;      /* service for this socket */
-       atomic_t                sk_inuse;       /* use count */
-       unsigned long           sk_flags;
-#define        SK_BUSY         0                       /* enqueued/receiving */
-#define        SK_CONN         1                       /* conn pending */
-#define        SK_CLOSE        2                       /* dead or dying */
-#define        SK_DATA         3                       /* data pending */
-#define        SK_TEMP         4                       /* temp (TCP) socket */
-#define        SK_DEAD         6                       /* socket closed */
-#define        SK_CHNGBUF      7                       /* need to change snd/rcv buffer sizes */
-#define        SK_DEFERRED     8                       /* request on sk_deferred */
-#define        SK_OLD          9                       /* used for temp socket aging mark+sweep */
-#define        SK_DETACHED     10                      /* detached from tempsocks list */
-
-       atomic_t                sk_reserved;    /* space on outq that is reserved */
-
-       spinlock_t              sk_lock;        /* protects sk_deferred and
-                                                * sk_info_authunix */
-       struct list_head        sk_deferred;    /* deferred requests that need to
-                                                * be revisted */
-       struct mutex            sk_mutex;       /* to serialize sending data */
-
-       int                     (*sk_recvfrom)(struct svc_rqst *rqstp);
-       int                     (*sk_sendto)(struct svc_rqst *rqstp);
-
        /* We keep the old state_change and data_ready CB's here */
        void                    (*sk_ostate)(struct sock *);
        void                    (*sk_odata)(struct sock *, int bytes);
@@ -54,21 +28,12 @@ struct svc_sock {
        /* private TCP part */
        int                     sk_reclen;      /* length of record */
        int                     sk_tcplen;      /* current read length */
-       time_t                  sk_lastrecv;    /* time of last received request */
-
-       /* cache of various info for TCP sockets */
-       void                    *sk_info_authunix;
-
-       struct sockaddr_storage sk_local;       /* local address */
-       struct sockaddr_storage sk_remote;      /* remote peer's address */
-       int                     sk_remotelen;   /* length of address */
 };
 
 /*
  * Function prototypes.
  */
-int            svc_makesock(struct svc_serv *, int, unsigned short, int flags);
-void           svc_force_close_socket(struct svc_sock *);
+void           svc_close_all(struct list_head *);
 int            svc_recv(struct svc_rqst *, long);
 int            svc_send(struct svc_rqst *);
 void           svc_drop(struct svc_rqst *);
@@ -78,6 +43,8 @@ int           svc_addsock(struct svc_serv *serv,
                            int fd,
                            char *name_return,
                            int *proto);
+void           svc_init_xprt_sock(void);
+void           svc_cleanup_xprt_sock(void);
 
 /*
  * svc_makesock socket characteristics
index 0751c9464d0f84ff7291c7ca332c65dbde383cee..e4057d729f036c8a3b80e3100a704e03ef0c6d4a 100644 (file)
@@ -112,7 +112,8 @@ struct xdr_buf {
 __be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len);
 __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len);
 __be32 *xdr_encode_string(__be32 *p, const char *s);
-__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen);
+__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, unsigned int *lenp,
+                       unsigned int maxlen);
 __be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *);
 __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *);
 
index 5c69a725e530b825b9cef83cbbd495b309c184b0..92e1dbe50947adb5d2612f1070362cc6f30d352e 100644 (file)
@@ -11,6 +11,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
            auth.o auth_null.o auth_unix.o \
            svc.o svcsock.o svcauth.o svcauth_unix.o \
            rpcb_clnt.o timer.o xdr.o \
-           sunrpc_syms.o cache.o rpc_pipe.o
+           sunrpc_syms.o cache.o rpc_pipe.o \
+           svc_xprt.o
 sunrpc-$(CONFIG_PROC_FS) += stats.o
 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
index 73940df6c460c210d15e9f3c42c3a7b52a2694c7..481f984e9a22278bc9ef1993bc3064c77fc65a51 100644 (file)
@@ -224,38 +224,34 @@ static int rsi_parse(struct cache_detail *cd,
 
        /* major/minor */
        len = qword_get(&mesg, buf, mlen);
-       if (len < 0)
+       if (len <= 0)
                goto out;
-       if (len == 0) {
+       rsii.major_status = simple_strtoul(buf, &ep, 10);
+       if (*ep)
+               goto out;
+       len = qword_get(&mesg, buf, mlen);
+       if (len <= 0)
+               goto out;
+       rsii.minor_status = simple_strtoul(buf, &ep, 10);
+       if (*ep)
                goto out;
-       } else {
-               rsii.major_status = simple_strtoul(buf, &ep, 10);
-               if (*ep)
-                       goto out;
-               len = qword_get(&mesg, buf, mlen);
-               if (len <= 0)
-                       goto out;
-               rsii.minor_status = simple_strtoul(buf, &ep, 10);
-               if (*ep)
-                       goto out;
 
-               /* out_handle */
-               len = qword_get(&mesg, buf, mlen);
-               if (len < 0)
-                       goto out;
-               status = -ENOMEM;
-               if (dup_to_netobj(&rsii.out_handle, buf, len))
-                       goto out;
+       /* out_handle */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0)
+               goto out;
+       status = -ENOMEM;
+       if (dup_to_netobj(&rsii.out_handle, buf, len))
+               goto out;
 
-               /* out_token */
-               len = qword_get(&mesg, buf, mlen);
-               status = -EINVAL;
-               if (len < 0)
-                       goto out;
-               status = -ENOMEM;
-               if (dup_to_netobj(&rsii.out_token, buf, len))
-                       goto out;
-       }
+       /* out_token */
+       len = qword_get(&mesg, buf, mlen);
+       status = -EINVAL;
+       if (len < 0)
+               goto out;
+       status = -ENOMEM;
+       if (dup_to_netobj(&rsii.out_token, buf, len))
+               goto out;
        rsii.h.expiry_time = expiry;
        rsip = rsi_update(&rsii, rsip);
        status = 0;
@@ -975,6 +971,7 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
        struct kvec *resv = &rqstp->rq_res.head[0];
        struct xdr_netobj tmpobj;
        struct rsi *rsip, rsikey;
+       int ret;
 
        /* Read the verifier; should be NULL: */
        *authp = rpc_autherr_badverf;
@@ -1014,23 +1011,27 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
                /* No upcall result: */
                return SVC_DROP;
        case 0:
+               ret = SVC_DROP;
                /* Got an answer to the upcall; use it: */
                if (gss_write_init_verf(rqstp, rsip))
-                       return SVC_DROP;
+                       goto out;
                if (resv->iov_len + 4 > PAGE_SIZE)
-                       return SVC_DROP;
+                       goto out;
                svc_putnl(resv, RPC_SUCCESS);
                if (svc_safe_putnetobj(resv, &rsip->out_handle))
-                       return SVC_DROP;
+                       goto out;
                if (resv->iov_len + 3 * 4 > PAGE_SIZE)
-                       return SVC_DROP;
+                       goto out;
                svc_putnl(resv, rsip->major_status);
                svc_putnl(resv, rsip->minor_status);
                svc_putnl(resv, GSS_SEQ_WIN);
                if (svc_safe_putnetobj(resv, &rsip->out_token))
-                       return SVC_DROP;
+                       goto out;
        }
-       return SVC_COMPLETE;
+       ret = SVC_COMPLETE;
+out:
+       cache_put(&rsip->h, &rsi_cache);
+       return ret;
 }
 
 /*
@@ -1125,6 +1126,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
        case RPC_GSS_PROC_DESTROY:
                if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
                        goto auth_err;
+               rsci->h.expiry_time = get_seconds();
                set_bit(CACHE_NEGATIVE, &rsci->h.flags);
                if (resv->iov_len + 4 > PAGE_SIZE)
                        goto drop;
@@ -1386,19 +1388,26 @@ int
 gss_svc_init(void)
 {
        int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss);
-       if (rv == 0) {
-               cache_register(&rsc_cache);
-               cache_register(&rsi_cache);
-       }
+       if (rv)
+               return rv;
+       rv = cache_register(&rsc_cache);
+       if (rv)
+               goto out1;
+       rv = cache_register(&rsi_cache);
+       if (rv)
+               goto out2;
+       return 0;
+out2:
+       cache_unregister(&rsc_cache);
+out1:
+       svc_auth_unregister(RPC_AUTH_GSS);
        return rv;
 }
 
 void
 gss_svc_shutdown(void)
 {
-       if (cache_unregister(&rsc_cache))
-               printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n");
-       if (cache_unregister(&rsi_cache))
-               printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n");
+       cache_unregister(&rsc_cache);
+       cache_unregister(&rsi_cache);
        svc_auth_unregister(RPC_AUTH_GSS);
 }
index 73f053d0cc7a4f7eee6d65954f5383cb6d6fa18e..636c8e04e0bebb0fff9b33e34a23bd8e40941dfc 100644 (file)
@@ -245,6 +245,7 @@ int cache_check(struct cache_detail *detail,
                cache_put(h, detail);
        return rv;
 }
+EXPORT_SYMBOL(cache_check);
 
 /*
  * caches need to be periodically cleaned.
@@ -290,44 +291,78 @@ static const struct file_operations cache_flush_operations;
 static void do_cache_clean(struct work_struct *work);
 static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean);
 
-void cache_register(struct cache_detail *cd)
+static void remove_cache_proc_entries(struct cache_detail *cd)
 {
-       cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc);
-       if (cd->proc_ent) {
-               struct proc_dir_entry *p;
-               cd->proc_ent->owner = cd->owner;
-               cd->channel_ent = cd->content_ent = NULL;
+       if (cd->proc_ent == NULL)
+               return;
+       if (cd->flush_ent)
+               remove_proc_entry("flush", cd->proc_ent);
+       if (cd->channel_ent)
+               remove_proc_entry("channel", cd->proc_ent);
+       if (cd->content_ent)
+               remove_proc_entry("content", cd->proc_ent);
+       cd->proc_ent = NULL;
+       remove_proc_entry(cd->name, proc_net_rpc);
+}
 
-               p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR,
-                                     cd->proc_ent);
-               cd->flush_ent =  p;
-               if (p) {
-                       p->proc_fops = &cache_flush_operations;
-                       p->owner = cd->owner;
-                       p->data = cd;
-               }
+#ifdef CONFIG_PROC_FS
+static int create_cache_proc_entries(struct cache_detail *cd)
+{
+       struct proc_dir_entry *p;
 
-               if (cd->cache_request || cd->cache_parse) {
-                       p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR,
-                                             cd->proc_ent);
-                       cd->channel_ent = p;
-                       if (p) {
-                               p->proc_fops = &cache_file_operations;
-                               p->owner = cd->owner;
-                               p->data = cd;
-                       }
-               }
-               if (cd->cache_show) {
-                       p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR,
-                                             cd->proc_ent);
-                       cd->content_ent = p;
-                       if (p) {
-                               p->proc_fops = &content_file_operations;
-                               p->owner = cd->owner;
-                               p->data = cd;
-                       }
-               }
+       cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc);
+       if (cd->proc_ent == NULL)
+               goto out_nomem;
+       cd->proc_ent->owner = cd->owner;
+       cd->channel_ent = cd->content_ent = NULL;
+
+       p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, cd->proc_ent);
+       cd->flush_ent = p;
+       if (p == NULL)
+               goto out_nomem;
+       p->proc_fops = &cache_flush_operations;
+       p->owner = cd->owner;
+       p->data = cd;
+
+       if (cd->cache_request || cd->cache_parse) {
+               p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR,
+                                     cd->proc_ent);
+               cd->channel_ent = p;
+               if (p == NULL)
+                       goto out_nomem;
+               p->proc_fops = &cache_file_operations;
+               p->owner = cd->owner;
+               p->data = cd;
        }
+       if (cd->cache_show) {
+               p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR,
+                                     cd->proc_ent);
+               cd->content_ent = p;
+               if (p == NULL)
+                       goto out_nomem;
+               p->proc_fops = &content_file_operations;
+               p->owner = cd->owner;
+               p->data = cd;
+       }
+       return 0;
+out_nomem:
+       remove_cache_proc_entries(cd);
+       return -ENOMEM;
+}
+#else /* CONFIG_PROC_FS */
+static int create_cache_proc_entries(struct cache_detail *cd)
+{
+       return 0;
+}
+#endif
+
+int cache_register(struct cache_detail *cd)
+{
+       int ret;
+
+       ret = create_cache_proc_entries(cd);
+       if (ret)
+               return ret;
        rwlock_init(&cd->hash_lock);
        INIT_LIST_HEAD(&cd->queue);
        spin_lock(&cache_list_lock);
@@ -341,9 +376,11 @@ void cache_register(struct cache_detail *cd)
 
        /* start the cleaning process */
        schedule_delayed_work(&cache_cleaner, 0);
+       return 0;
 }
+EXPORT_SYMBOL(cache_register);
 
-int cache_unregister(struct cache_detail *cd)
+void cache_unregister(struct cache_detail *cd)
 {
        cache_purge(cd);
        spin_lock(&cache_list_lock);
@@ -351,30 +388,23 @@ int cache_unregister(struct cache_detail *cd)
        if (cd->entries || atomic_read(&cd->inuse)) {
                write_unlock(&cd->hash_lock);
                spin_unlock(&cache_list_lock);
-               return -EBUSY;
+               goto out;
        }
        if (current_detail == cd)
                current_detail = NULL;
        list_del_init(&cd->others);
        write_unlock(&cd->hash_lock);
        spin_unlock(&cache_list_lock);
-       if (cd->proc_ent) {
-               if (cd->flush_ent)
-                       remove_proc_entry("flush", cd->proc_ent);
-               if (cd->channel_ent)
-                       remove_proc_entry("channel", cd->proc_ent);
-               if (cd->content_ent)
-                       remove_proc_entry("content", cd->proc_ent);
-
-               cd->proc_ent = NULL;
-               remove_proc_entry(cd->name, proc_net_rpc);
-       }
+       remove_cache_proc_entries(cd);
        if (list_empty(&cache_list)) {
                /* module must be being unloaded so its safe to kill the worker */
                cancel_delayed_work_sync(&cache_cleaner);
        }
-       return 0;
+       return;
+out:
+       printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name);
 }
+EXPORT_SYMBOL(cache_unregister);
 
 /* clean cache tries to find something to clean
  * and cleans it.
@@ -489,6 +519,7 @@ void cache_flush(void)
        while (cache_clean() != -1)
                cond_resched();
 }
+EXPORT_SYMBOL(cache_flush);
 
 void cache_purge(struct cache_detail *detail)
 {
@@ -497,7 +528,7 @@ void cache_purge(struct cache_detail *detail)
        cache_flush();
        detail->flush_time = 1;
 }
-
+EXPORT_SYMBOL(cache_purge);
 
 
 /*
@@ -634,13 +665,13 @@ void cache_clean_deferred(void *owner)
 /*
  * communicate with user-space
  *
- * We have a magic /proc file - /proc/sunrpc/cache
- * On read, you get a full request, or block
- * On write, an update request is processed
- * Poll works if anything to read, and always allows write
+ * We have a magic /proc file - /proc/net/rpc/<cachename>/channel.
+ * On read, you get a full request, or block.
+ * On write, an update request is processed.
+ * Poll works if anything to read, and always allows write.
  *
  * Implemented by linked list of requests.  Each open file has
- * a ->private that also exists in this list.  New request are added
+ * a ->private that also exists in this list.  New requests are added
  * to the end and may wakeup and preceding readers.
  * New readers are added to the head.  If, on read, an item is found with
  * CACHE_UPCALLING clear, we free it from the list.
@@ -963,6 +994,7 @@ void qword_add(char **bpp, int *lp, char *str)
        *bpp = bp;
        *lp = len;
 }
+EXPORT_SYMBOL(qword_add);
 
 void qword_addhex(char **bpp, int *lp, char *buf, int blen)
 {
@@ -991,6 +1023,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen)
        *bpp = bp;
        *lp = len;
 }
+EXPORT_SYMBOL(qword_addhex);
 
 static void warn_no_listener(struct cache_detail *detail)
 {
@@ -1113,6 +1146,7 @@ int qword_get(char **bpp, char *dest, int bufsize)
        *dest = '\0';
        return len;
 }
+EXPORT_SYMBOL(qword_get);
 
 
 /*
@@ -1244,18 +1278,18 @@ static ssize_t read_flush(struct file *file, char __user *buf,
        struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data;
        char tbuf[20];
        unsigned long p = *ppos;
-       int len;
+       size_t len;
 
        sprintf(tbuf, "%lu\n", cd->flush_time);
        len = strlen(tbuf);
        if (p >= len)
                return 0;
        len -= p;
-       if (len > count) len = count;
+       if (len > count)
+               len = count;
        if (copy_to_user(buf, (void*)(tbuf+p), len))
-               len = -EFAULT;
-       else
-               *ppos += len;
+               return -EFAULT;
+       *ppos += len;
        return len;
 }
 
index 74df2d358e61ba5eb77a806a2cb1d58e163dd7de..5a16875f5ac8b0686ccdf5e9448dac850aab7064 100644 (file)
@@ -33,7 +33,7 @@ struct proc_dir_entry *proc_net_rpc = NULL;
 static int rpc_proc_show(struct seq_file *seq, void *v) {
        const struct rpc_stat   *statp = seq->private;
        const struct rpc_program *prog = statp->program;
-       int             i, j;
+       unsigned int i, j;
 
        seq_printf(seq,
                "net %u %u %u %u\n",
@@ -81,7 +81,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
        const struct svc_program *prog = statp->program;
        const struct svc_procedure *proc;
        const struct svc_version *vers;
-       int             i, j;
+       unsigned int i, j;
 
        seq_printf(seq,
                "net %u %u %u %u\n",
@@ -106,6 +106,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
                seq_putc(seq, '\n');
        }
 }
+EXPORT_SYMBOL(svc_seq_show);
 
 /**
  * rpc_alloc_iostats - allocate an rpc_iostats structure
@@ -255,12 +256,14 @@ svc_proc_register(struct svc_stat *statp, const struct file_operations *fops)
 {
        return do_register(statp->program->pg_name, statp, fops);
 }
+EXPORT_SYMBOL(svc_proc_register);
 
 void
 svc_proc_unregister(const char *name)
 {
        remove_proc_entry(name, proc_net_rpc);
 }
+EXPORT_SYMBOL(svc_proc_unregister);
 
 void
 rpc_proc_init(void)
index 1a7e309d008bcd9624dff48195c8052c515a3791..843629f557630d07b0b4b0e29eb5a6f9feddce88 100644 (file)
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/sunrpc/xprtsock.h>
 
-/* RPC server stuff */
-EXPORT_SYMBOL(svc_create);
-EXPORT_SYMBOL(svc_create_thread);
-EXPORT_SYMBOL(svc_create_pooled);
-EXPORT_SYMBOL(svc_set_num_threads);
-EXPORT_SYMBOL(svc_exit_thread);
-EXPORT_SYMBOL(svc_destroy);
-EXPORT_SYMBOL(svc_drop);
-EXPORT_SYMBOL(svc_process);
-EXPORT_SYMBOL(svc_recv);
-EXPORT_SYMBOL(svc_wake_up);
-EXPORT_SYMBOL(svc_makesock);
-EXPORT_SYMBOL(svc_reserve);
-EXPORT_SYMBOL(svc_auth_register);
-EXPORT_SYMBOL(auth_domain_lookup);
-EXPORT_SYMBOL(svc_authenticate);
-EXPORT_SYMBOL(svc_set_client);
-
-/* RPC statistics */
-#ifdef CONFIG_PROC_FS
-EXPORT_SYMBOL(svc_proc_register);
-EXPORT_SYMBOL(svc_proc_unregister);
-EXPORT_SYMBOL(svc_seq_show);
-#endif
-
-/* caching... */
-EXPORT_SYMBOL(auth_domain_find);
-EXPORT_SYMBOL(auth_domain_put);
-EXPORT_SYMBOL(auth_unix_add_addr);
-EXPORT_SYMBOL(auth_unix_forget_old);
-EXPORT_SYMBOL(auth_unix_lookup);
-EXPORT_SYMBOL(cache_check);
-EXPORT_SYMBOL(cache_flush);
-EXPORT_SYMBOL(cache_purge);
-EXPORT_SYMBOL(cache_register);
-EXPORT_SYMBOL(cache_unregister);
-EXPORT_SYMBOL(qword_add);
-EXPORT_SYMBOL(qword_addhex);
-EXPORT_SYMBOL(qword_get);
-EXPORT_SYMBOL(svcauth_unix_purge);
-EXPORT_SYMBOL(unix_domain_find);
-
 extern struct cache_detail ip_map_cache, unix_gid_cache;
 
 static int __init
@@ -85,7 +43,8 @@ init_sunrpc(void)
 #endif
        cache_register(&ip_map_cache);
        cache_register(&unix_gid_cache);
-       init_socket_xprt();
+       svc_init_xprt_sock();   /* svc sock transport */
+       init_socket_xprt();     /* clnt sock transport */
        rpcauth_init_module();
 out:
        return err;
@@ -96,12 +55,11 @@ cleanup_sunrpc(void)
 {
        rpcauth_remove_module();
        cleanup_socket_xprt();
+       svc_cleanup_xprt_sock();
        unregister_rpc_pipefs();
        rpc_destroy_mempool();
-       if (cache_unregister(&ip_map_cache))
-               printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n");
-       if (cache_unregister(&unix_gid_cache))
-             printk(KERN_ERR "sunrpc: failed to unregister unix_gid cache\n");
+       cache_unregister(&ip_map_cache);
+       cache_unregister(&unix_gid_cache);
 #ifdef RPC_DEBUG
        rpc_unregister_sysctl();
 #endif
index 4ad5fbbb18b48df0d2ad9a0b457212a6c764330d..a290e1523297783da4e491ad547e17cdeb5da675 100644 (file)
@@ -364,7 +364,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
           void (*shutdown)(struct svc_serv *serv))
 {
        struct svc_serv *serv;
-       int vers;
+       unsigned int vers;
        unsigned int xdrsize;
        unsigned int i;
 
@@ -433,6 +433,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize,
 {
        return __svc_create(prog, bufsize, /*npools*/1, shutdown);
 }
+EXPORT_SYMBOL(svc_create);
 
 struct svc_serv *
 svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
@@ -452,6 +453,7 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
 
        return serv;
 }
+EXPORT_SYMBOL(svc_create_pooled);
 
 /*
  * Destroy an RPC service.  Should be called with the BKL held
@@ -459,9 +461,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
 void
 svc_destroy(struct svc_serv *serv)
 {
-       struct svc_sock *svsk;
-       struct svc_sock *tmp;
-
        dprintk("svc: svc_destroy(%s, %d)\n",
                                serv->sv_program->pg_name,
                                serv->sv_nrthreads);
@@ -476,14 +475,12 @@ svc_destroy(struct svc_serv *serv)
 
        del_timer_sync(&serv->sv_temptimer);
 
-       list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list)
-               svc_force_close_socket(svsk);
+       svc_close_all(&serv->sv_tempsocks);
 
        if (serv->sv_shutdown)
                serv->sv_shutdown(serv);
 
-       list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list)
-               svc_force_close_socket(svsk);
+       svc_close_all(&serv->sv_permsocks);
 
        BUG_ON(!list_empty(&serv->sv_permsocks));
        BUG_ON(!list_empty(&serv->sv_tempsocks));
@@ -498,6 +495,7 @@ svc_destroy(struct svc_serv *serv)
        kfree(serv->sv_pools);
        kfree(serv);
 }
+EXPORT_SYMBOL(svc_destroy);
 
 /*
  * Allocate an RPC server's buffer space.
@@ -536,31 +534,17 @@ svc_release_buffer(struct svc_rqst *rqstp)
                        put_page(rqstp->rq_pages[i]);
 }
 
-/*
- * Create a thread in the given pool.  Caller must hold BKL.
- * On a NUMA or SMP machine, with a multi-pool serv, the thread
- * will be restricted to run on the cpus belonging to the pool.
- */
-static int
-__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
-                   struct svc_pool *pool)
+struct svc_rqst *
+svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
 {
        struct svc_rqst *rqstp;
-       int             error = -ENOMEM;
-       int             have_oldmask = 0;
-       cpumask_t       oldmask;
 
        rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
        if (!rqstp)
-               goto out;
+               goto out_enomem;
 
        init_waitqueue_head(&rqstp->rq_wait);
 
-       if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
-        || !(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
-        || !svc_init_buffer(rqstp, serv->sv_max_mesg))
-               goto out_thread;
-
        serv->sv_nrthreads++;
        spin_lock_bh(&pool->sp_lock);
        pool->sp_nrthreads++;
@@ -569,6 +553,45 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
        rqstp->rq_server = serv;
        rqstp->rq_pool = pool;
 
+       rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
+       if (!rqstp->rq_argp)
+               goto out_thread;
+
+       rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
+       if (!rqstp->rq_resp)
+               goto out_thread;
+
+       if (!svc_init_buffer(rqstp, serv->sv_max_mesg))
+               goto out_thread;
+
+       return rqstp;
+out_thread:
+       svc_exit_thread(rqstp);
+out_enomem:
+       return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(svc_prepare_thread);
+
+/*
+ * Create a thread in the given pool.  Caller must hold BKL.
+ * On a NUMA or SMP machine, with a multi-pool serv, the thread
+ * will be restricted to run on the cpus belonging to the pool.
+ */
+static int
+__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
+                   struct svc_pool *pool)
+{
+       struct svc_rqst *rqstp;
+       int             error = -ENOMEM;
+       int             have_oldmask = 0;
+       cpumask_t       oldmask;
+
+       rqstp = svc_prepare_thread(serv, pool);
+       if (IS_ERR(rqstp)) {
+               error = PTR_ERR(rqstp);
+               goto out;
+       }
+
        if (serv->sv_nrpools > 1)
                have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
 
@@ -597,6 +620,7 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv)
 {
        return __svc_create_thread(func, serv, &serv->sv_pools[0]);
 }
+EXPORT_SYMBOL(svc_create_thread);
 
 /*
  * Choose a pool in which to create a new thread, for svc_set_num_threads
@@ -700,6 +724,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 
        return error;
 }
+EXPORT_SYMBOL(svc_set_num_threads);
 
 /*
  * Called from a server thread as it's exiting.  Caller must hold BKL.
@@ -726,6 +751,7 @@ svc_exit_thread(struct svc_rqst *rqstp)
        if (serv)
                svc_destroy(serv);
 }
+EXPORT_SYMBOL(svc_exit_thread);
 
 /*
  * Register an RPC service with the local portmapper.
@@ -737,7 +763,8 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port)
 {
        struct svc_program      *progp;
        unsigned long           flags;
-       int                     i, error = 0, dummy;
+       unsigned int            i;
+       int                     error = 0, dummy;
 
        if (!port)
                clear_thread_flag(TIF_SIGPENDING);
@@ -840,9 +867,9 @@ svc_process(struct svc_rqst *rqstp)
        rqstp->rq_res.tail[0].iov_len = 0;
        /* Will be turned off only in gss privacy case: */
        rqstp->rq_splice_ok = 1;
-       /* tcp needs a space for the record length... */
-       if (rqstp->rq_prot == IPPROTO_TCP)
-               svc_putnl(resv, 0);
+
+       /* Setup reply header */
+       rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
 
        rqstp->rq_xid = svc_getu32(argv);
        svc_putu32(resv, rqstp->rq_xid);
@@ -1049,16 +1076,15 @@ err_bad:
        svc_putnl(resv, ntohl(rpc_stat));
        goto sendit;
 }
+EXPORT_SYMBOL(svc_process);
 
 /*
  * Return (transport-specific) limit on the rpc payload.
  */
 u32 svc_max_payload(const struct svc_rqst *rqstp)
 {
-       int max = RPCSVC_MAXPAYLOAD_TCP;
+       u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload;
 
-       if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM)
-               max = RPCSVC_MAXPAYLOAD_UDP;
        if (rqstp->rq_server->sv_max_payload < max)
                max = rqstp->rq_server->sv_max_payload;
        return max;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
new file mode 100644 (file)
index 0000000..ea377e0
--- /dev/null
@@ -0,0 +1,1055 @@
+/*
+ * linux/net/sunrpc/svc_xprt.c
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <linux/freezer.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/tcp_states.h>
+#include <linux/uaccess.h>
+#include <asm/ioctls.h>
+
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svc_xprt.h>
+
+#define RPCDBG_FACILITY        RPCDBG_SVCXPRT
+
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
+static int svc_deferred_recv(struct svc_rqst *rqstp);
+static struct cache_deferred_req *svc_defer(struct cache_req *req);
+static void svc_age_temp_xprts(unsigned long closure);
+
+/* apparently the "standard" is that clients close
+ * idle connections after 5 minutes, servers after
+ * 6 minutes
+ *   http://www.connectathon.org/talks96/nfstcp.pdf
+ */
+static int svc_conn_age_period = 6*60;
+
+/* List of registered transport classes */
+static DEFINE_SPINLOCK(svc_xprt_class_lock);
+static LIST_HEAD(svc_xprt_class_list);
+
+/* SMP locking strategy:
+ *
+ *     svc_pool->sp_lock protects most of the fields of that pool.
+ *     svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
+ *     when both need to be taken (rare), svc_serv->sv_lock is first.
+ *     BKL protects svc_serv->sv_nrthreads.
+ *     svc_sock->sk_lock protects the svc_sock->sk_deferred list
+ *             and the ->sk_info_authunix cache.
+ *
+ *     The XPT_BUSY bit in xprt->xpt_flags prevents a transport being
+ *     enqueued multiply. During normal transport processing this bit
+ *     is set by svc_xprt_enqueue and cleared by svc_xprt_received.
+ *     Providers should not manipulate this bit directly.
+ *
+ *     Some flags can be set to certain values at any time
+ *     providing that certain rules are followed:
+ *
+ *     XPT_CONN, XPT_DATA:
+ *             - Can be set or cleared at any time.
+ *             - After a set, svc_xprt_enqueue must be called to enqueue
+ *               the transport for processing.
+ *             - After a clear, the transport must be read/accepted.
+ *               If this succeeds, it must be set again.
+ *     XPT_CLOSE:
+ *             - Can be set at any time. It is never cleared.
+ *      XPT_DEAD:
+ *             - Can only be set while XPT_BUSY is held which ensures
+ *               that no other thread will be using the transport or will
+ *               try to set XPT_DEAD.
+ */
+
+/*
+ * Add a transport class to the list of registered classes.
+ *
+ * Class names are unique: if a class with the same xcl_name is already
+ * on the list nothing is added and -EEXIST is returned.  Returns 0 on
+ * success.  The list is scanned and modified under svc_xprt_class_lock.
+ */
+int svc_reg_xprt_class(struct svc_xprt_class *xcl)
+{
+       struct svc_xprt_class *pos;
+       int duplicate = 0;
+
+       dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name);
+
+       INIT_LIST_HEAD(&xcl->xcl_list);
+       spin_lock(&svc_xprt_class_lock);
+       list_for_each_entry(pos, &svc_xprt_class_list, xcl_list) {
+               if (!strcmp(xcl->xcl_name, pos->xcl_name)) {
+                       duplicate = 1;
+                       break;
+               }
+       }
+       if (!duplicate)
+               list_add_tail(&xcl->xcl_list, &svc_xprt_class_list);
+       spin_unlock(&svc_xprt_class_lock);
+
+       return duplicate ? -EEXIST : 0;
+}
+EXPORT_SYMBOL_GPL(svc_reg_xprt_class);
+
+/*
+ * Remove a transport class from the list of registered classes.
+ * The list entry is unlinked (and re-initialised) under
+ * svc_xprt_class_lock.
+ */
+void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
+{
+       dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name);
+       spin_lock(&svc_xprt_class_lock);
+       list_del_init(&xcl->xcl_list);
+       spin_unlock(&svc_xprt_class_lock);
+}
+EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
+
+/*
+ * Format the transport list for printing
+ *
+ * Writes one "<name> <max payload>\n" line per registered transport
+ * class into @buf, which is @maxlen bytes long, and returns the number
+ * of characters stored, not counting the terminating NUL.  Output stops
+ * at the first entry that does not fit; @buf is always NUL-terminated
+ * when @maxlen is positive.
+ */
+int svc_print_xprts(char *buf, int maxlen)
+{
+       struct svc_xprt_class *xcl;
+       char tmpstr[80];
+       int len = 0;
+
+       /* No room for even the terminating NUL */
+       if (maxlen <= 0)
+               return 0;
+       buf[0] = '\0';
+
+       spin_lock(&svc_xprt_class_lock);
+       list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
+               int slen;
+
+               /* Bounded format: a long class name must not overrun tmpstr */
+               slen = snprintf(tmpstr, sizeof(tmpstr), "%s %d\n",
+                               xcl->xcl_name, xcl->xcl_max_payload);
+               /*
+                * Stop if the entry was truncated, or if it would not fit
+                * together with its terminating NUL (hence ">=", not ">").
+                */
+               if (slen >= (int)sizeof(tmpstr) || len + slen >= maxlen)
+                       break;
+               /* Append at the known end instead of rescanning buf */
+               strcpy(buf + len, tmpstr);
+               len += slen;
+       }
+       spin_unlock(&svc_xprt_class_lock);
+
+       return len;
+}
+
+/*
+ * kref release function for a transport (passed to kref_put by
+ * svc_xprt_put).  Releases the cached auth info if XPT_CACHE_AUTH is
+ * set and one is present, hands the transport to the provider's
+ * xpo_free method and finally drops the module reference on the
+ * transport class owner.
+ */
+static void svc_xprt_free(struct kref *kref)
+{
+       struct svc_xprt *xprt =
+               container_of(kref, struct svc_xprt, xpt_ref);
+       /* Fetch the owner up front: xprt is not used again after xpo_free */
+       struct module *owner = xprt->xpt_class->xcl_owner;
+       if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)
+           && xprt->xpt_auth_cache != NULL)
+               svcauth_unix_info_release(xprt->xpt_auth_cache);
+       xprt->xpt_ops->xpo_free(xprt);
+       module_put(owner);
+}
+
+/*
+ * Drop a reference on a transport.  svc_xprt_free is the release
+ * function given to kref_put for the final reference.
+ */
+void svc_xprt_put(struct svc_xprt *xprt)
+{
+       kref_put(&xprt->xpt_ref, svc_xprt_free);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_put);
+
+/*
+ * Set up the transport independent part of a transport instance on
+ * behalf of a transport driver.
+ *
+ * The instance is zeroed first, so every field not assigned below
+ * starts out as zero.  The transport is handed back with XPT_BUSY set.
+ */
+void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
+                  struct svc_serv *serv)
+{
+       memset(xprt, 0, sizeof(*xprt));
+
+       /* Who we belong to and how we are driven */
+       xprt->xpt_server = serv;
+       xprt->xpt_class = xcl;
+       xprt->xpt_ops = xcl->xcl_ops;
+
+       /* List linkage */
+       INIT_LIST_HEAD(&xprt->xpt_deferred);
+       INIT_LIST_HEAD(&xprt->xpt_ready);
+       INIT_LIST_HEAD(&xprt->xpt_list);
+
+       /* Locks and reference count */
+       spin_lock_init(&xprt->xpt_lock);
+       mutex_init(&xprt->xpt_mutex);
+       kref_init(&xprt->xpt_ref);
+
+       set_bit(XPT_BUSY, &xprt->xpt_flags);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_init);
+
+/*
+ * Create a permanent transport of the class named @xprt_name for @serv,
+ * bound to INADDR_ANY:@port (AF_INET only), and add it to
+ * serv->sv_permsocks.
+ *
+ * Returns the value of svc_xprt_local_port() for the new transport on
+ * success, the error encoded by the class's xpo_create method if
+ * creation fails, or -ENOENT if no class of that name is registered or
+ * a reference on the class's owning module could not be taken.
+ */
+int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
+                   int flags)
+{
+       struct svc_xprt_class *xcl;
+       struct sockaddr_in sin = {
+               .sin_family             = AF_INET,
+               .sin_addr.s_addr        = INADDR_ANY,
+               .sin_port               = htons(port),
+       };
+       dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
+       spin_lock(&svc_xprt_class_lock);
+       list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
+               struct svc_xprt *newxprt;
+
+               if (strcmp(xprt_name, xcl->xcl_name))
+                       continue;
+
+               /* Pin the class's module before dropping the list lock */
+               if (!try_module_get(xcl->xcl_owner))
+                       goto err;
+
+               /*
+                * The class list lock is released before xpo_create runs;
+                * every path below returns, so the list walk is never
+                * resumed without the lock.
+                */
+               spin_unlock(&svc_xprt_class_lock);
+               newxprt = xcl->xcl_ops->
+                       xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin),
+                                  flags);
+               if (IS_ERR(newxprt)) {
+                       module_put(xcl->xcl_owner);
+                       return PTR_ERR(newxprt);
+               }
+
+               clear_bit(XPT_TEMP, &newxprt->xpt_flags);
+               spin_lock_bh(&serv->sv_lock);
+               list_add(&newxprt->xpt_list, &serv->sv_permsocks);
+               spin_unlock_bh(&serv->sv_lock);
+               /* svc_xprt_init left XPT_BUSY set; release it now */
+               clear_bit(XPT_BUSY, &newxprt->xpt_flags);
+               return svc_xprt_local_port(newxprt);
+       }
+ err:
+       /* Reached with svc_xprt_class_lock still held */
+       spin_unlock(&svc_xprt_class_lock);
+       dprintk("svc: transport %s not found\n", xprt_name);
+       return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(svc_create_xprt);
+
+/*
+ * Copy the transport's remote address into rqstp->rq_addr and record
+ * the transport's local address as the request's destination address.
+ *
+ * The destination address is needed later for binding the source
+ * address in RPC replies/callbacks.  Only AF_INET and AF_INET6 local
+ * addresses are copied; for any other family rq_daddr is left alone.
+ */
+void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+{
+       struct sockaddr *local = (struct sockaddr *)&xprt->xpt_local;
+
+       rqstp->rq_addrlen = xprt->xpt_remotelen;
+       memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen);
+
+       if (local->sa_family == AF_INET)
+               rqstp->rq_daddr.addr =
+                       ((struct sockaddr_in *)local)->sin_addr;
+       else if (local->sa_family == AF_INET6)
+               rqstp->rq_daddr.addr6 =
+                       ((struct sockaddr_in6 *)local)->sin6_addr;
+}
+EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs);
+
+/**
+ * svc_print_addr - Format rq_addr field for printing
+ * @rqstp: svc_rqst struct containing address to print
+ * @buf: target buffer for formatted address
+ * @len: length of target buffer
+ *
+ * Thin wrapper: passes svc_addr(@rqstp) together with @buf and @len to
+ * __svc_print_addr() and returns that function's result.
+ */
+char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
+{
+       return __svc_print_addr(svc_addr(rqstp), buf, len);
+}
+EXPORT_SYMBOL_GPL(svc_print_addr);
+
+/*
+ * Queue up an idle server thread.  Must have pool->sp_lock held.
+ * Note: this is really a stack rather than a queue (list_add inserts
+ * at the head of sp_threads), so that we only use as many different
+ * threads as we need, and the rest don't pollute the cache.
+ */
+static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
+{
+       list_add(&rqstp->rq_list, &pool->sp_threads);
+}
+
+/*
+ * Remove a server thread from the idle list it is queued on.
+ * Must have pool->sp_lock held.  @pool itself is not referenced here;
+ * the thread is simply unlinked via its rq_list node.
+ */
+static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
+{
+       list_del(&rqstp->rq_list);
+}
+
+/*
+ * Queue up a transport with work pending.  If there is an idle server
+ * thread in the pool, hand the transport to it and wake it up;
+ * otherwise put the transport on the pool's sp_sockets list.
+ *
+ * Nothing is done if none of XPT_CONN, XPT_DATA, XPT_CLOSE or
+ * XPT_DEFERRED is set, if the transport is XPT_DEAD, or if it is
+ * already XPT_BUSY.  A transport with neither XPT_CONN nor XPT_CLOSE
+ * set is only queued if the provider reports write space for a reply.
+ */
+void svc_xprt_enqueue(struct svc_xprt *xprt)
+{
+       struct svc_serv *serv = xprt->xpt_server;
+       struct svc_pool *pool;
+       struct svc_rqst *rqstp;
+       int cpu;
+
+       /* Cheap unlocked checks first: nothing to do, or already dead */
+       if (!(xprt->xpt_flags &
+             ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
+               return;
+       if (test_bit(XPT_DEAD, &xprt->xpt_flags))
+               return;
+
+       /* Pick the pool associated with the CPU we are running on */
+       cpu = get_cpu();
+       pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
+       put_cpu();
+
+       spin_lock_bh(&pool->sp_lock);
+
+       /* Idle threads and queued transports should never coexist */
+       if (!list_empty(&pool->sp_threads) &&
+           !list_empty(&pool->sp_sockets))
+               printk(KERN_ERR
+                      "svc_xprt_enqueue: "
+                      "threads and transports both waiting??\n");
+
+       /* Re-check XPT_DEAD now that the pool lock is held */
+       if (test_bit(XPT_DEAD, &xprt->xpt_flags)) {
+               /* Don't enqueue dead transports */
+               dprintk("svc: transport %p is dead, not enqueued\n", xprt);
+               goto out_unlock;
+       }
+
+       /* Mark transport as busy. It will remain in this state until
+        * the provider calls svc_xprt_received. We update XPT_BUSY
+        * atomically because it also guards against trying to enqueue
+        * the transport twice.
+        */
+       if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
+               /* Don't enqueue transport while already enqueued */
+               dprintk("svc: transport %p busy, not enqueued\n", xprt);
+               goto out_unlock;
+       }
+       BUG_ON(xprt->xpt_pool != NULL);
+       xprt->xpt_pool = pool;
+
+       /* Handle pending connection */
+       if (test_bit(XPT_CONN, &xprt->xpt_flags))
+               goto process;
+
+       /* Handle close in-progress */
+       if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
+               goto process;
+
+       /* Check if we have space to reply to a request */
+       if (!xprt->xpt_ops->xpo_has_wspace(xprt)) {
+               /* Don't enqueue while not enough space for reply */
+               dprintk("svc: no write space, transport %p  not enqueued\n",
+                       xprt);
+               /* Undo the XPT_BUSY claim taken above */
+               xprt->xpt_pool = NULL;
+               clear_bit(XPT_BUSY, &xprt->xpt_flags);
+               goto out_unlock;
+       }
+
+ process:
+       if (!list_empty(&pool->sp_threads)) {
+               /* Hand the transport straight to the first idle thread */
+               rqstp = list_entry(pool->sp_threads.next,
+                                  struct svc_rqst,
+                                  rq_list);
+               dprintk("svc: transport %p served by daemon %p\n",
+                       xprt, rqstp);
+               svc_thread_dequeue(pool, rqstp);
+               if (rqstp->rq_xprt)
+                       printk(KERN_ERR
+                               "svc_xprt_enqueue: server %p, rq_xprt=%p!\n",
+                               rqstp, rqstp->rq_xprt);
+               /* The thread gets its own reference on the transport */
+               rqstp->rq_xprt = xprt;
+               svc_xprt_get(xprt);
+               /* Reserve space for a maximum-sized reply */
+               rqstp->rq_reserved = serv->sv_max_mesg;
+               atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+               BUG_ON(xprt->xpt_pool != pool);
+               wake_up(&rqstp->rq_wait);
+       } else {
+               /* No idle thread: leave it for the next svc_recv caller */
+               dprintk("svc: transport %p put into queue\n", xprt);
+               list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
+               BUG_ON(xprt->xpt_pool != pool);
+       }
+
+out_unlock:
+       spin_unlock_bh(&pool->sp_lock);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
+
+/*
+ * Take the transport at the head of the pool's sp_sockets list off the
+ * list and return it, or return NULL if the list is empty.
+ * Must be called with the pool->sp_lock held.
+ */
+static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
+{
+       struct svc_xprt *first = NULL;
+
+       if (!list_empty(&pool->sp_sockets)) {
+               first = list_entry(pool->sp_sockets.next,
+                                  struct svc_xprt, xpt_ready);
+               list_del_init(&first->xpt_ready);
+
+               dprintk("svc: transport %p dequeued, inuse=%d\n",
+                       first, atomic_read(&first->xpt_ref.refcount));
+       }
+
+       return first;
+}
+
+/*
+ * svc_xprt_received conditionally queues the transport for processing
+ * by another thread. The caller must hold the XPT_BUSY bit (this is
+ * enforced with BUG_ON) and must not thereafter touch transport data.
+ *
+ * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
+ * insufficient) data.
+ */
+void svc_xprt_received(struct svc_xprt *xprt)
+{
+       BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags));
+       /* Detach from the pool before giving up XPT_BUSY */
+       xprt->xpt_pool = NULL;
+       clear_bit(XPT_BUSY, &xprt->xpt_flags);
+       /* Requeue; svc_xprt_enqueue returns early if no work is flagged */
+       svc_xprt_enqueue(xprt);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_received);
+
+/**
+ * svc_reserve - change the space reserved for the reply to a request.
+ * @rqstp:  The request in question
+ * @space: new max space to reserve
+ *
+ * Every request holds a reservation on its transport's output queue so
+ * that the reply is guaranteed to fit.  This function shrinks that
+ * reservation to the space already used by the reply head plus @space;
+ * it never grows it.  When the reservation shrinks the transport is
+ * passed to svc_xprt_enqueue() again.
+ */
+void svc_reserve(struct svc_rqst *rqstp, int space)
+{
+       struct svc_xprt *xprt;
+       int wanted = space + rqstp->rq_res.head[0].iov_len;
+
+       /* Only ever reduce the reservation */
+       if (wanted >= rqstp->rq_reserved)
+               return;
+
+       xprt = rqstp->rq_xprt;
+       atomic_sub((rqstp->rq_reserved - wanted), &xprt->xpt_reserved);
+       rqstp->rq_reserved = wanted;
+
+       svc_xprt_enqueue(xprt);
+}
+EXPORT_SYMBOL(svc_reserve);
+
+/*
+ * Finish with a request's transport: let the provider release the
+ * request, free the response pages, reset the response buffer, give
+ * back the reply space reservation and drop the request's reference
+ * on the transport.
+ */
+static void svc_xprt_release(struct svc_rqst *rqstp)
+{
+       struct svc_xprt *xprt = rqstp->rq_xprt;
+       struct xdr_buf *res = &rqstp->rq_res;
+
+       xprt->xpt_ops->xpo_release_rqst(rqstp);
+
+       svc_free_res_pages(rqstp);
+       res->page_base = 0;
+       res->page_len = 0;
+
+       /*
+        * Before dropping the reservation, check that it was large
+        * enough for the reply; if not, we have a bug.
+        */
+       if (res->len > rqstp->rq_reserved)
+               printk(KERN_ERR "RPC request reserved %d but used %d\n",
+                      rqstp->rq_reserved, res->len);
+
+       /* With an empty head, svc_reserve(.., 0) shrinks towards zero */
+       res->head[0].iov_len = 0;
+       svc_reserve(rqstp, 0);
+       rqstp->rq_xprt = NULL;
+
+       svc_xprt_put(xprt);
+}
+
+/*
+ * External function to wake up a server waiting for data.
+ * This really only makes sense for services like lockd
+ * which have exactly one thread anyway.
+ *
+ * In every pool the first idle thread, if any, is woken.  The thread
+ * is deliberately left on the pool's sp_threads list.
+ */
+void svc_wake_up(struct svc_serv *serv)
+{
+       unsigned int idx;
+
+       for (idx = 0; idx < serv->sv_nrpools; idx++) {
+               struct svc_pool *pool = &serv->sv_pools[idx];
+               struct svc_rqst *idle;
+
+               spin_lock_bh(&pool->sp_lock);
+               if (list_empty(&pool->sp_threads)) {
+                       spin_unlock_bh(&pool->sp_lock);
+                       continue;
+               }
+
+               idle = list_entry(pool->sp_threads.next,
+                                 struct svc_rqst, rq_list);
+               dprintk("svc: daemon %p woken up.\n", idle);
+               wake_up(&idle->rq_wait);
+               spin_unlock_bh(&pool->sp_lock);
+       }
+}
+EXPORT_SYMBOL(svc_wake_up);
+
+/*
+ * Return nonzero if the port in @sin (AF_INET or AF_INET6) is below
+ * PROT_SOCK, zero otherwise.  Addresses of any other family are
+ * never considered privileged.
+ */
+int svc_port_is_privileged(struct sockaddr *sin)
+{
+       unsigned short port;
+
+       switch (sin->sa_family) {
+       case AF_INET:
+               port = ntohs(((struct sockaddr_in *)sin)->sin_port);
+               break;
+       case AF_INET6:
+               port = ntohs(((struct sockaddr_in6 *)sin)->sin6_port);
+               break;
+       default:
+               return 0;
+       }
+
+       return port < PROT_SOCK;
+}
+
+/*
+ * Make sure that we don't have too many active connections.  If we
+ * have, something must be dropped.
+ *
+ * There's no point in trying to do random drop here for DoS
+ * prevention. NFS clients do 1 reconnect in 15 seconds. An
+ * attacker can easily beat that.
+ *
+ * The only somewhat efficient mechanism would be to drop old
+ * connections from the same IP first. But right now we don't even
+ * record the client IP in svc_sock.
+ */
+static void svc_check_conn_limits(struct svc_serv *serv)
+{
+       struct svc_xprt *victim = NULL;
+
+       /* Still within the limit: nothing to do */
+       if (serv->sv_tmpcnt <= (serv->sv_nrthreads+3)*20)
+               return;
+
+       spin_lock_bh(&serv->sv_lock);
+       if (!list_empty(&serv->sv_tempsocks)) {
+               /* Try to help the admin */
+               if (net_ratelimit())
+                       printk(KERN_NOTICE "%s: too many open  "
+                              "connections, consider increasing the "
+                              "number of nfsd threads\n",
+                              serv->sv_name);
+
+               /*
+                * Always pick the entry at the tail of sv_tempsocks,
+                * i.e. the oldest connection.  It's not fair, but
+                * neither is life.
+                */
+               victim = list_entry(serv->sv_tempsocks.prev,
+                                   struct svc_xprt, xpt_list);
+               set_bit(XPT_CLOSE, &victim->xpt_flags);
+               /* Hold a reference across the unlocked enqueue below */
+               svc_xprt_get(victim);
+       }
+       spin_unlock_bh(&serv->sv_lock);
+
+       if (!victim)
+               return;
+
+       svc_xprt_enqueue(victim);
+       svc_xprt_put(victim);
+}
+
+/*
+ * Receive the next request on any transport.  This code is carefully
+ * organised not to touch any cachelines in the shared svc_serv
+ * structure, only cachelines in the local svc_pool.
+ *
+ * @rqstp:   the calling server thread's request structure
+ * @timeout: how long to sleep waiting for a transport, passed to
+ *           schedule_timeout()
+ *
+ * Returns the length reported by the transport's xpo_recvfrom method
+ * (or by svc_deferred_recv for a revisited request).  Returns -EINTR
+ * if the thread was signalled, and -EAGAIN if no transport became
+ * available or the transport produced nothing to process (no data, an
+ * accepted connection, or a close).
+ */
+int svc_recv(struct svc_rqst *rqstp, long timeout)
+{
+       struct svc_xprt         *xprt = NULL;
+       struct svc_serv         *serv = rqstp->rq_server;
+       struct svc_pool         *pool = rqstp->rq_pool;
+       int                     len, i;
+       int                     pages;
+       struct xdr_buf          *arg;
+       DECLARE_WAITQUEUE(wait, current);
+
+       dprintk("svc: server %p waiting for data (to = %ld)\n",
+               rqstp, timeout);
+
+       if (rqstp->rq_xprt)
+               printk(KERN_ERR
+                       "svc_recv: service %p, transport not NULL!\n",
+                        rqstp);
+       if (waitqueue_active(&rqstp->rq_wait))
+               printk(KERN_ERR
+                       "svc_recv: service %p, wait queue active!\n",
+                        rqstp);
+
+       /* now allocate needed pages.  If we get a failure, sleep briefly */
+       pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
+       /*
+        * Validate the page count before rq_pages[0..pages] is written
+        * below, rather than after the array has already been indexed.
+        */
+       BUG_ON(pages >= RPCSVC_MAXPAGES);
+       for (i = 0; i < pages ; i++)
+               while (rqstp->rq_pages[i] == NULL) {
+                       struct page *p = alloc_page(GFP_KERNEL);
+                       if (!p) {
+                               int j = msecs_to_jiffies(500);
+                               schedule_timeout_uninterruptible(j);
+                       }
+                       /* Stores NULL on failure, so the loop retries */
+                       rqstp->rq_pages[i] = p;
+               }
+       rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
+
+       /* Make arg->head point to first page and arg->pages point to rest */
+       arg = &rqstp->rq_arg;
+       arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
+       arg->head[0].iov_len = PAGE_SIZE;
+       arg->pages = rqstp->rq_pages + 1;
+       arg->page_base = 0;
+       /* save at least one page for response */
+       arg->page_len = (pages-2)*PAGE_SIZE;
+       arg->len = (pages-1)*PAGE_SIZE;
+       arg->tail[0].iov_len = 0;
+
+       try_to_freeze();
+       cond_resched();
+       if (signalled())
+               return -EINTR;
+
+       spin_lock_bh(&pool->sp_lock);
+       xprt = svc_xprt_dequeue(pool);
+       if (xprt) {
+               /* A transport was already waiting: claim it and reserve
+                * space for a maximum-sized reply. */
+               rqstp->rq_xprt = xprt;
+               svc_xprt_get(xprt);
+               rqstp->rq_reserved = serv->sv_max_mesg;
+               atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+       } else {
+               /* No data pending. Go to sleep */
+               svc_thread_enqueue(pool, rqstp);
+
+               /*
+                * We have to be able to interrupt this wait
+                * to bring down the daemons ...
+                */
+               set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue(&rqstp->rq_wait, &wait);
+               spin_unlock_bh(&pool->sp_lock);
+
+               schedule_timeout(timeout);
+
+               try_to_freeze();
+
+               spin_lock_bh(&pool->sp_lock);
+               remove_wait_queue(&rqstp->rq_wait, &wait);
+
+               /* svc_xprt_enqueue sets rq_xprt when it hands us work */
+               xprt = rqstp->rq_xprt;
+               if (!xprt) {
+                       /* Nobody dequeued us, so leave the idle list */
+                       svc_thread_dequeue(pool, rqstp);
+                       spin_unlock_bh(&pool->sp_lock);
+                       dprintk("svc: server %p, no data yet\n", rqstp);
+                       return signalled()? -EINTR : -EAGAIN;
+               }
+       }
+       spin_unlock_bh(&pool->sp_lock);
+
+       len = 0;
+       if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
+               dprintk("svc_recv: found XPT_CLOSE\n");
+               svc_delete_xprt(xprt);
+       } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
+               struct svc_xprt *newxpt;
+               newxpt = xprt->xpt_ops->xpo_accept(xprt);
+               if (newxpt) {
+                       /*
+                        * We know this module_get will succeed because the
+                        * listener holds a reference too
+                        */
+                       __module_get(newxpt->xpt_class->xcl_owner);
+                       svc_check_conn_limits(xprt->xpt_server);
+                       spin_lock_bh(&serv->sv_lock);
+                       set_bit(XPT_TEMP, &newxpt->xpt_flags);
+                       list_add(&newxpt->xpt_list, &serv->sv_tempsocks);
+                       serv->sv_tmpcnt++;
+                       if (serv->sv_temptimer.function == NULL) {
+                               /* setup timer to age temp transports */
+                               setup_timer(&serv->sv_temptimer,
+                                           svc_age_temp_xprts,
+                                           (unsigned long)serv);
+                               mod_timer(&serv->sv_temptimer,
+                                         jiffies + svc_conn_age_period * HZ);
+                       }
+                       spin_unlock_bh(&serv->sv_lock);
+                       svc_xprt_received(newxpt);
+               }
+               svc_xprt_received(xprt);
+       } else {
+               dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
+                       rqstp, pool->sp_id, xprt,
+                       atomic_read(&xprt->xpt_ref.refcount));
+               /* A revisited (deferred) request takes priority over new data */
+               rqstp->rq_deferred = svc_deferred_dequeue(xprt);
+               if (rqstp->rq_deferred) {
+                       svc_xprt_received(xprt);
+                       len = svc_deferred_recv(rqstp);
+               } else
+                       len = xprt->xpt_ops->xpo_recvfrom(rqstp);
+               dprintk("svc: got len=%d\n", len);
+       }
+
+       /* No data, incomplete (TCP) read, or accept() */
+       if (len == 0 || len == -EAGAIN) {
+               rqstp->rq_res.len = 0;
+               svc_xprt_release(rqstp);
+               return -EAGAIN;
+       }
+       /* The transport was just used: unmark it for the aging timer */
+       clear_bit(XPT_OLD, &xprt->xpt_flags);
+
+       rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
+       rqstp->rq_chandle.defer = svc_defer;
+
+       if (serv->sv_stats)
+               serv->sv_stats->netcnt++;
+       return len;
+}
+EXPORT_SYMBOL(svc_recv);
+
+/*
+ * Drop request: no reply is sent; the request's hold on its transport
+ * is simply released via svc_xprt_release.
+ */
+void svc_drop(struct svc_rqst *rqstp)
+{
+       dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
+       svc_xprt_release(rqstp);
+}
+EXPORT_SYMBOL(svc_drop);
+
+/*
+ * Return reply to client.
+ *
+ * Returns -EFAULT if the request has no transport.  Otherwise the
+ * reply is passed to the transport's xpo_sendto method (unless the
+ * transport is XPT_DEAD) and the request's hold on the transport is
+ * released.  -ECONNREFUSED, -ENOTCONN and -EAGAIN are reported to the
+ * caller as 0; any other result is returned unchanged.
+ */
+int svc_send(struct svc_rqst *rqstp)
+{
+       struct svc_xprt *xprt = rqstp->rq_xprt;
+       struct xdr_buf  *res;
+       int             sent;
+
+       if (xprt == NULL)
+               return -EFAULT;
+
+       /* release the receive skb before sending the reply */
+       xprt->xpt_ops->xpo_release_rqst(rqstp);
+
+       /* calculate over-all length */
+       res = &rqstp->rq_res;
+       res->len = res->head[0].iov_len + res->page_len +
+               res->tail[0].iov_len;
+
+       /* Grab mutex to serialize outgoing data. */
+       mutex_lock(&xprt->xpt_mutex);
+       if (!test_bit(XPT_DEAD, &xprt->xpt_flags))
+               sent = xprt->xpt_ops->xpo_sendto(rqstp);
+       else
+               sent = -ENOTCONN;
+       mutex_unlock(&xprt->xpt_mutex);
+       svc_xprt_release(rqstp);
+
+       switch (sent) {
+       case -ECONNREFUSED:
+       case -ENOTCONN:
+       case -EAGAIN:
+               return 0;
+       default:
+               return sent;
+       }
+}
+
+/*
+ * Timer function to close old temporary transports, using
+ * a mark-and-sweep algorithm.
+ *
+ * @closure is the struct svc_serv the timer was set up for.  A
+ * transport found without XPT_OLD is only marked; one found with
+ * XPT_OLD already set (svc_recv clears the bit whenever it receives
+ * on the transport) is flagged XPT_CLOSE and queued so that a server
+ * thread closes it.  The timer re-arms itself on every run.
+ */
+static void svc_age_temp_xprts(unsigned long closure)
+{
+       struct svc_serv *serv = (struct svc_serv *)closure;
+       struct svc_xprt *xprt;
+       struct list_head *le, *next;
+       LIST_HEAD(to_be_aged);
+
+       dprintk("svc_age_temp_xprts\n");
+
+       /* Do not spin on sv_lock here; retry shortly instead */
+       if (!spin_trylock_bh(&serv->sv_lock)) {
+               /* busy, try again 1 sec later */
+               dprintk("svc_age_temp_xprts: busy\n");
+               mod_timer(&serv->sv_temptimer, jiffies + HZ);
+               return;
+       }
+
+       list_for_each_safe(le, next, &serv->sv_tempsocks) {
+               xprt = list_entry(le, struct svc_xprt, xpt_list);
+
+               /* First time through, just mark it OLD. Second time
+                * through, close it. */
+               if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
+                       continue;
+               /* Skip transports that are referenced elsewhere or busy */
+               if (atomic_read(&xprt->xpt_ref.refcount) > 1
+                   || test_bit(XPT_BUSY, &xprt->xpt_flags))
+                       continue;
+               /* Hold a reference while it sits on the private list */
+               svc_xprt_get(xprt);
+               list_move(le, &to_be_aged);
+               set_bit(XPT_CLOSE, &xprt->xpt_flags);
+               /* Tells svc_delete_xprt the xpt_list node is already unlinked */
+               set_bit(XPT_DETACHED, &xprt->xpt_flags);
+       }
+       spin_unlock_bh(&serv->sv_lock);
+
+       while (!list_empty(&to_be_aged)) {
+               le = to_be_aged.next;
+               /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */
+               list_del_init(le);
+               xprt = list_entry(le, struct svc_xprt, xpt_list);
+
+               dprintk("queuing xprt %p for closing\n", xprt);
+
+               /* a thread will dequeue and close it soon */
+               svc_xprt_enqueue(xprt);
+               svc_xprt_put(xprt);
+       }
+
+       mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
+}
+
+/*
+ * Remove a dead transport
+ *
+ * Calls the provider's xpo_detach method, unlinks the transport from
+ * its server list (unless it is already XPT_DETACHED) and marks it
+ * XPT_DEAD.  The first caller to set XPT_DEAD also drops one
+ * reference and, for XPT_TEMP transports, decrements sv_tmpcnt.
+ */
+void svc_delete_xprt(struct svc_xprt *xprt)
+{
+       struct svc_serv *serv = xprt->xpt_server;
+
+       dprintk("svc: svc_delete_xprt(%p)\n", xprt);
+       xprt->xpt_ops->xpo_detach(xprt);
+
+       spin_lock_bh(&serv->sv_lock);
+       if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags))
+               list_del_init(&xprt->xpt_list);
+       /*
+        * We used to delete the transport from whichever list
+        * its sk_xprt.xpt_ready node was on, but we don't actually
+        * need to.  This is because the only time we're called
+        * while still attached to a queue, the queue itself
+        * is about to be destroyed (in svc_destroy).
+        */
+       if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) {
+               /* The caller must hold a reference besides the one dropped here */
+               BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2);
+               if (test_bit(XPT_TEMP, &xprt->xpt_flags))
+                       serv->sv_tmpcnt--;
+               svc_xprt_put(xprt);
+       }
+       spin_unlock_bh(&serv->sv_lock);
+}
+
+/*
+ * Request that a transport be closed.  XPT_CLOSE is always set.  If
+ * the transport is not XPT_BUSY it is deleted right here; otherwise
+ * the close is left to whoever currently holds XPT_BUSY.
+ */
+void svc_close_xprt(struct svc_xprt *xprt)
+{
+       set_bit(XPT_CLOSE, &xprt->xpt_flags);
+       if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
+               /* someone else will have to effect the close */
+               return;
+
+       /* Extra reference: svc_delete_xprt requires a refcount of at least 2 */
+       svc_xprt_get(xprt);
+       svc_delete_xprt(xprt);
+       clear_bit(XPT_BUSY, &xprt->xpt_flags);
+       svc_xprt_put(xprt);
+}
+EXPORT_SYMBOL_GPL(svc_close_xprt);
+
+/*
+ * Close every transport on @xprt_list.  The _safe list iterator is
+ * used because closing a transport unlinks its xpt_list node.
+ */
+void svc_close_all(struct list_head *xprt_list)
+{
+       struct svc_xprt *xprt;
+       struct svc_xprt *tmp;
+
+       list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) {
+               set_bit(XPT_CLOSE, &xprt->xpt_flags);
+               if (test_bit(XPT_BUSY, &xprt->xpt_flags)) {
+                       /* Waiting to be processed, but no threads left,
+                        * So just remove it from the waiting list
+                        */
+                       list_del_init(&xprt->xpt_ready);
+                       /* Clear BUSY so svc_close_xprt below does the close itself */
+                       clear_bit(XPT_BUSY, &xprt->xpt_flags);
+               }
+               svc_close_xprt(xprt);
+       }
+}
+
+/*
+ * Handle defer and revisit of requests
+ */
+
+/*
+ * Revisit callback for a deferred request; installed as
+ * handle.revisit by svc_defer.
+ *
+ * If @too_many is set the deferred request is discarded.  Otherwise
+ * it is put on the transport's xpt_deferred list, XPT_DEFERRED is set
+ * and the transport is enqueued for processing.  Either way the
+ * transport reference taken in svc_defer is dropped.
+ */
+static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
+{
+       struct svc_deferred_req *dr =
+               container_of(dreq, struct svc_deferred_req, handle);
+       struct svc_xprt *xprt = dr->xprt;
+
+       if (too_many) {
+               svc_xprt_put(xprt);
+               kfree(dr);
+               return;
+       }
+       dprintk("revisit queued\n");
+       dr->xprt = NULL;
+       /* xpt_lock guards the xpt_deferred list */
+       spin_lock(&xprt->xpt_lock);
+       list_add(&dr->handle.recent, &xprt->xpt_deferred);
+       spin_unlock(&xprt->xpt_lock);
+       set_bit(XPT_DEFERRED, &xprt->xpt_flags);
+       svc_xprt_enqueue(xprt);
+       svc_xprt_put(xprt);
+}
+
+/*
+ * Save the request off for later processing. The request buffer looks
+ * like this:
+ *
+ * <xprt-header><rpc-header><rpc-pagelist><rpc-tail>
+ *
+ * This code can only handle requests that consist of an xprt-header
+ * and rpc-header.
+ *
+ * Returns the handle of the deferred request, or NULL if the request
+ * carries page data or the allocation fails.  On success a reference
+ * on the request's transport is taken and stored in the deferred
+ * request; svc_revisit drops it.
+ */
+static struct cache_deferred_req *svc_defer(struct cache_req *req)
+{
+       struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
+       struct svc_deferred_req *dr;
+
+       if (rqstp->rq_arg.page_len)
+               return NULL; /* if more than a page, give up FIXME */
+       if (rqstp->rq_deferred) {
+               /* Already a revisited request: reuse its saved copy */
+               dr = rqstp->rq_deferred;
+               rqstp->rq_deferred = NULL;
+       } else {
+               size_t skip;
+               size_t size;
+               /* FIXME maybe discard if size too large */
+               size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len;
+               dr = kmalloc(size, GFP_KERNEL);
+               if (dr == NULL)
+                       return NULL;
+
+               dr->handle.owner = rqstp->rq_server;
+               dr->prot = rqstp->rq_prot;
+               memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
+               dr->addrlen = rqstp->rq_addrlen;
+               dr->daddr = rqstp->rq_daddr;
+               /* argslen is kept in 4-byte units (converted back with <<2) */
+               dr->argslen = rqstp->rq_arg.len >> 2;
+               dr->xprt_hlen = rqstp->rq_xprt_hlen;
+
+               /* back up head to the start of the buffer and copy */
+               skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
+               memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip,
+                      dr->argslen << 2);
+       }
+       svc_xprt_get(rqstp->rq_xprt);
+       dr->xprt = rqstp->rq_xprt;
+
+       dr->handle.revisit = svc_revisit;
+       return &dr->handle;
+}
+
+/*
+ * Load the request saved in rqstp->rq_deferred back into rqstp and
+ * return the number of bytes that follow the transport header.
+ */
+static int svc_deferred_recv(struct svc_rqst *rqstp)
+{
+       struct svc_deferred_req *dr = rqstp->rq_deferred;
+
+       /* Restore the protocol and the addresses saved with the request */
+       rqstp->rq_prot = dr->prot;
+       memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
+       rqstp->rq_addrlen = dr->addrlen;
+       rqstp->rq_daddr = dr->daddr;
+
+       /* Keep the transport header length in case of another deferral */
+       rqstp->rq_xprt_hlen = dr->xprt_hlen;
+
+       /* rq_arg.len covers the whole saved buffer, transport header too */
+       rqstp->rq_arg.len = dr->argslen << 2;
+       rqstp->rq_arg.page_len = 0;
+
+       /* head[0] begins after the transport header and does not count it */
+       rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen >> 2);
+       rqstp->rq_arg.head[0].iov_len = (dr->argslen << 2) - dr->xprt_hlen;
+
+       rqstp->rq_respages = rqstp->rq_pages;
+
+       return (dr->argslen << 2) - dr->xprt_hlen;
+}
+
+/*
+ * Remove and return the first request on xprt's xpt_deferred list.
+ * Returns NULL when XPT_DEFERRED is not set or the list is empty.
+ * XPT_DEFERRED is cleared under xpt_lock and set again whenever a
+ * request was taken off the list.
+ */
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
+{
+       struct svc_deferred_req *dr = NULL;
+
+       if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags))
+               return NULL;
+       spin_lock(&xprt->xpt_lock);
+       clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
+       if (!list_empty(&xprt->xpt_deferred)) {
+               dr = list_entry(xprt->xpt_deferred.next,
+                               struct svc_deferred_req,
+                               handle.recent);
+               list_del_init(&dr->handle.recent);
+               set_bit(XPT_DEFERRED, &xprt->xpt_flags);
+       }
+       spin_unlock(&xprt->xpt_lock);
+       return dr;
+}
+
+/*
+ * Look through the service's sv_permsocks list for the first transport
+ * whose class name equals xcl_name and whose local address family and
+ * port match the ones requested.
+ *
+ * Passing AF_UNSPEC for the address family, or 0 for the port, makes
+ * that criterion a wild-card that matches every transport.
+ *
+ * Returns NULL when serv or xcl_name is NULL or nothing matches.
+ * svc_xprt_get() has been called on a transport that is returned.
+ */
+struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name,
+                              int af, int port)
+{
+       struct svc_xprt *cur;
+       struct svc_xprt *match = NULL;
+
+       /* Both the service and the class name are required */
+       if (serv == NULL || xcl_name == NULL)
+               return match;
+
+       spin_lock_bh(&serv->sv_lock);
+       list_for_each_entry(cur, &serv->sv_permsocks, xpt_list) {
+               if (strcmp(cur->xpt_class->xcl_name, xcl_name) == 0 &&
+                   (af == AF_UNSPEC || af == cur->xpt_local.ss_family) &&
+                   (port == 0 || port == svc_xprt_local_port(cur))) {
+                       match = cur;
+                       svc_xprt_get(cur);
+                       break;
+               }
+       }
+       spin_unlock_bh(&serv->sv_lock);
+       return match;
+}
+EXPORT_SYMBOL_GPL(svc_find_xprt);
+
+/*
+ * Write one "<class name> <local port>" line into buf for each transport
+ * on the service's sv_permsocks list and return the number of characters
+ * written. Passing zero as buflen switches off the check that stops the
+ * output before it would overflow buf.
+ */
+int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen)
+{
+       struct svc_xprt *xprt;
+       char line[64];
+       int written = 0;
+       int n;
+
+       /* Nothing to list without a service */
+       if (serv == NULL)
+               return 0;
+
+       spin_lock_bh(&serv->sv_lock);
+       list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
+               n = snprintf(line, sizeof(line), "%s %d\n",
+                            xprt->xpt_class->xcl_name,
+                            svc_xprt_local_port(xprt));
+               /* An entry that did not fit is reported as an error string */
+               if (n >= sizeof(line))
+                       strcpy(line, "name-too-long\n");
+               n = strlen(line);
+               /* Stop before this entry would run past the end of buf */
+               if (buflen && (written + n >= buflen))
+                       break;
+               strcpy(buf + written, line);
+               written += n;
+       }
+       spin_unlock_bh(&serv->sv_lock);
+       return written;
+}
+EXPORT_SYMBOL_GPL(svc_xprt_names);
index af7c5f05c6e11b3d5d734d1ee2143e54c71517bb..8a73cbb16052397b0455070cfc670a931ff2ec8f 100644 (file)
@@ -57,11 +57,13 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
        rqstp->rq_authop = aops;
        return aops->accept(rqstp, authp);
 }
+EXPORT_SYMBOL(svc_authenticate);
 
 int svc_set_client(struct svc_rqst *rqstp)
 {
        return rqstp->rq_authop->set_client(rqstp);
 }
+EXPORT_SYMBOL(svc_set_client);
 
 /* A request, which was authenticated, has now executed.
  * Time to finalise the credentials and verifier
@@ -93,6 +95,7 @@ svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
        spin_unlock(&authtab_lock);
        return rv;
 }
+EXPORT_SYMBOL(svc_auth_register);
 
 void
 svc_auth_unregister(rpc_authflavor_t flavor)
@@ -129,6 +132,7 @@ void auth_domain_put(struct auth_domain *dom)
                spin_unlock(&auth_domain_lock);
        }
 }
+EXPORT_SYMBOL(auth_domain_put);
 
 struct auth_domain *
 auth_domain_lookup(char *name, struct auth_domain *new)
@@ -153,8 +157,10 @@ auth_domain_lookup(char *name, struct auth_domain *new)
        spin_unlock(&auth_domain_lock);
        return new;
 }
+EXPORT_SYMBOL(auth_domain_lookup);
 
 struct auth_domain *auth_domain_find(char *name)
 {
        return auth_domain_lookup(name, NULL);
 }
+EXPORT_SYMBOL(auth_domain_find);
index 411479411b21052a3e138c5109880abd881b77b4..3c64051e455533aeb11df479f3c72877838e4483 100644 (file)
@@ -63,6 +63,7 @@ struct auth_domain *unix_domain_find(char *name)
                rv = auth_domain_lookup(name, &new->h);
        }
 }
+EXPORT_SYMBOL(unix_domain_find);
 
 static void svcauth_unix_domain_release(struct auth_domain *dom)
 {
@@ -340,6 +341,7 @@ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
        else
                return -ENOMEM;
 }
+EXPORT_SYMBOL(auth_unix_add_addr);
 
 int auth_unix_forget_old(struct auth_domain *dom)
 {
@@ -351,6 +353,7 @@ int auth_unix_forget_old(struct auth_domain *dom)
        udom->addr_changes++;
        return 0;
 }
+EXPORT_SYMBOL(auth_unix_forget_old);
 
 struct auth_domain *auth_unix_lookup(struct in_addr addr)
 {
@@ -375,50 +378,56 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr)
        cache_put(&ipm->h, &ip_map_cache);
        return rv;
 }
+EXPORT_SYMBOL(auth_unix_lookup);
 
 void svcauth_unix_purge(void)
 {
        cache_purge(&ip_map_cache);
 }
+EXPORT_SYMBOL(svcauth_unix_purge);
 
 static inline struct ip_map *
 ip_map_cached_get(struct svc_rqst *rqstp)
 {
-       struct ip_map *ipm;
-       struct svc_sock *svsk = rqstp->rq_sock;
-       spin_lock(&svsk->sk_lock);
-       ipm = svsk->sk_info_authunix;
-       if (ipm != NULL) {
-               if (!cache_valid(&ipm->h)) {
-                       /*
-                        * The entry has been invalidated since it was
-                        * remembered, e.g. by a second mount from the
-                        * same IP address.
-                        */
-                       svsk->sk_info_authunix = NULL;
-                       spin_unlock(&svsk->sk_lock);
-                       cache_put(&ipm->h, &ip_map_cache);
-                       return NULL;
+       struct ip_map *ipm = NULL;      /* returned if nothing is cached */
+       struct svc_xprt *xprt = rqstp->rq_xprt;
+
+       if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
+               spin_lock(&xprt->xpt_lock);
+               ipm = xprt->xpt_auth_cache;
+               if (ipm != NULL) {
+                       if (!cache_valid(&ipm->h)) {
+                               /*
+                                * The entry has been invalidated since it was
+                                * remembered, e.g. by a second mount from the
+                                * same IP address.
+                                */
+                               xprt->xpt_auth_cache = NULL;
+                               spin_unlock(&xprt->xpt_lock);
+                               cache_put(&ipm->h, &ip_map_cache);
+                               return NULL;
+                       }
+                       cache_get(&ipm->h);     /* for the returned pointer */
                }
-               cache_get(&ipm->h);
+               spin_unlock(&xprt->xpt_lock);
        }
-       spin_unlock(&svsk->sk_lock);
        return ipm;
 }
 
 static inline void
 ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
 {
-       struct svc_sock *svsk = rqstp->rq_sock;
+       struct svc_xprt *xprt = rqstp->rq_xprt;
 
-       spin_lock(&svsk->sk_lock);
-       if (svsk->sk_sock->type == SOCK_STREAM &&
-           svsk->sk_info_authunix == NULL) {
-               /* newly cached, keep the reference */
-               svsk->sk_info_authunix = ipm;
-               ipm = NULL;
+       if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
+               spin_lock(&xprt->xpt_lock);
+               if (xprt->xpt_auth_cache == NULL) {     /* nothing cached yet */
+                       /* newly cached, keep the reference */
+                       xprt->xpt_auth_cache = ipm;
+                       ipm = NULL;     /* skips the cache_put() below */
+               }
+               spin_unlock(&xprt->xpt_lock);
        }
-       spin_unlock(&svsk->sk_lock);
        if (ipm)
                cache_put(&ipm->h, &ip_map_cache);
 }
index c75bffeb89eb705831585ba1447c0ba77dd06a9c..1d3e5fcc2cc4d0728f4d99c39d28ecd1fb9a2d80 100644 (file)
@@ -5,7 +5,7 @@
  *
  * The server scheduling algorithm does not always distribute the load
  * evenly when servicing a single client. May need to modify the
- * svc_sock_enqueue procedure...
+ * svc_xprt_enqueue procedure...
  *
  * TCP support is largely untested and may be a little slow. The problem
  * is that we currently do two separate recvfrom's, one for the 4-byte
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/stats.h>
 
-/* SMP locking strategy:
- *
- *     svc_pool->sp_lock protects most of the fields of that pool.
- *     svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
- *     when both need to be taken (rare), svc_serv->sv_lock is first.
- *     BKL protects svc_serv->sv_nrthread.
- *     svc_sock->sk_lock protects the svc_sock->sk_deferred list
- *             and the ->sk_info_authunix cache.
- *     svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply.
- *
- *     Some flags can be set to certain values at any time
- *     providing that certain rules are followed:
- *
- *     SK_CONN, SK_DATA, can be set or cleared at any time.
- *             after a set, svc_sock_enqueue must be called.
- *             after a clear, the socket must be read/accepted
- *              if this succeeds, it must be set again.
- *     SK_CLOSE can set at any time. It is never cleared.
- *      sk_inuse contains a bias of '1' until SK_DEAD is set.
- *             so when sk_inuse hits zero, we know the socket is dead
- *             and no-one is using it.
- *      SK_DEAD can only be set while SK_BUSY is held which ensures
- *             no other thread will be using the socket or will try to
- *            set SK_DEAD.
- *
- */
-
-#define RPCDBG_FACILITY        RPCDBG_SVCSOCK
+#define RPCDBG_FACILITY        RPCDBG_SVCXPRT
 
 
 static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
                                         int *errp, int flags);
-static void            svc_delete_socket(struct svc_sock *svsk);
 static void            svc_udp_data_ready(struct sock *, int);
 static int             svc_udp_recvfrom(struct svc_rqst *);
 static int             svc_udp_sendto(struct svc_rqst *);
-static void            svc_close_socket(struct svc_sock *svsk);
-
-static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk);
-static int svc_deferred_recv(struct svc_rqst *rqstp);
-static struct cache_deferred_req *svc_defer(struct cache_req *req);
-
-/* apparently the "standard" is that clients close
- * idle connections after 5 minutes, servers after
- * 6 minutes
- *   http://www.connectathon.org/talks96/nfstcp.pdf
- */
-static int svc_conn_age_period = 6*60;
+static void            svc_sock_detach(struct svc_xprt *);
+static void            svc_sock_free(struct svc_xprt *);
 
+static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
+                                         struct sockaddr *, int, int);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key svc_key[2];
 static struct lock_class_key svc_slock_key[2];
 
-static inline void svc_reclassify_socket(struct socket *sock)
+static void svc_reclassify_socket(struct socket *sock)
 {
        struct sock *sk = sock->sk;
        BUG_ON(sock_owned_by_user(sk));
        switch (sk->sk_family) {
        case AF_INET:
                sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
-                   &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]);
+                                             &svc_slock_key[0],
+                                             "sk_xprt.xpt_lock-AF_INET-NFSD",
+                                             &svc_key[0]);
                break;
 
        case AF_INET6:
                sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
-                   &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]);
+                                             &svc_slock_key[1],
+                                             "sk_xprt.xpt_lock-AF_INET6-NFSD",
+                                             &svc_key[1]);
                break;
 
        default:
@@ -121,81 +89,26 @@ static inline void svc_reclassify_socket(struct socket *sock)
        }
 }
 #else
-static inline void svc_reclassify_socket(struct socket *sock)
+static void svc_reclassify_socket(struct socket *sock)
 {
 }
 #endif
 
-static char *__svc_print_addr(struct sockaddr *addr, char *buf, size_t len)
-{
-       switch (addr->sa_family) {
-       case AF_INET:
-               snprintf(buf, len, "%u.%u.%u.%u, port=%u",
-                       NIPQUAD(((struct sockaddr_in *) addr)->sin_addr),
-                       ntohs(((struct sockaddr_in *) addr)->sin_port));
-               break;
-
-       case AF_INET6:
-               snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u",
-                       NIP6(((struct sockaddr_in6 *) addr)->sin6_addr),
-                       ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
-               break;
-
-       default:
-               snprintf(buf, len, "unknown address type: %d", addr->sa_family);
-               break;
-       }
-       return buf;
-}
-
-/**
- * svc_print_addr - Format rq_addr field for printing
- * @rqstp: svc_rqst struct containing address to print
- * @buf: target buffer for formatted address
- * @len: length of target buffer
- *
- */
-char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
-{
-       return __svc_print_addr(svc_addr(rqstp), buf, len);
-}
-EXPORT_SYMBOL_GPL(svc_print_addr);
-
-/*
- * Queue up an idle server thread.  Must have pool->sp_lock held.
- * Note: this is really a stack rather than a queue, so that we only
- * use as many different threads as we need, and the rest don't pollute
- * the cache.
- */
-static inline void
-svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
-{
-       list_add(&rqstp->rq_list, &pool->sp_threads);
-}
-
-/*
- * Dequeue an nfsd thread.  Must have pool->sp_lock held.
- */
-static inline void
-svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
-{
-       list_del(&rqstp->rq_list);
-}
-
 /*
  * Release an skbuff after use
  */
-static inline void
-svc_release_skb(struct svc_rqst *rqstp)
+static void svc_release_skb(struct svc_rqst *rqstp)
 {
-       struct sk_buff *skb = rqstp->rq_skbuff;
+       struct sk_buff *skb = rqstp->rq_xprt_ctxt;
        struct svc_deferred_req *dr = rqstp->rq_deferred;
 
        if (skb) {
-               rqstp->rq_skbuff = NULL;
+               struct svc_sock *svsk =
+                       container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+               rqstp->rq_xprt_ctxt = NULL;
 
                dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
-               skb_free_datagram(rqstp->rq_sock->sk_sk, skb);
+               skb_free_datagram(svsk->sk_sk, skb);
        }
        if (dr) {
                rqstp->rq_deferred = NULL;
@@ -203,253 +116,6 @@ svc_release_skb(struct svc_rqst *rqstp)
        }
 }
 
-/*
- * Any space to write?
- */
-static inline unsigned long
-svc_sock_wspace(struct svc_sock *svsk)
-{
-       int wspace;
-
-       if (svsk->sk_sock->type == SOCK_STREAM)
-               wspace = sk_stream_wspace(svsk->sk_sk);
-       else
-               wspace = sock_wspace(svsk->sk_sk);
-
-       return wspace;
-}
-
-/*
- * Queue up a socket with data pending. If there are idle nfsd
- * processes, wake 'em up.
- *
- */
-static void
-svc_sock_enqueue(struct svc_sock *svsk)
-{
-       struct svc_serv *serv = svsk->sk_server;
-       struct svc_pool *pool;
-       struct svc_rqst *rqstp;
-       int cpu;
-
-       if (!(svsk->sk_flags &
-             ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
-               return;
-       if (test_bit(SK_DEAD, &svsk->sk_flags))
-               return;
-
-       cpu = get_cpu();
-       pool = svc_pool_for_cpu(svsk->sk_server, cpu);
-       put_cpu();
-
-       spin_lock_bh(&pool->sp_lock);
-
-       if (!list_empty(&pool->sp_threads) &&
-           !list_empty(&pool->sp_sockets))
-               printk(KERN_ERR
-                       "svc_sock_enqueue: threads and sockets both waiting??\n");
-
-       if (test_bit(SK_DEAD, &svsk->sk_flags)) {
-               /* Don't enqueue dead sockets */
-               dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk);
-               goto out_unlock;
-       }
-
-       /* Mark socket as busy. It will remain in this state until the
-        * server has processed all pending data and put the socket back
-        * on the idle list.  We update SK_BUSY atomically because
-        * it also guards against trying to enqueue the svc_sock twice.
-        */
-       if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) {
-               /* Don't enqueue socket while already enqueued */
-               dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk);
-               goto out_unlock;
-       }
-       BUG_ON(svsk->sk_pool != NULL);
-       svsk->sk_pool = pool;
-
-       set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
-       if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2
-            > svc_sock_wspace(svsk))
-           && !test_bit(SK_CLOSE, &svsk->sk_flags)
-           && !test_bit(SK_CONN, &svsk->sk_flags)) {
-               /* Don't enqueue while not enough space for reply */
-               dprintk("svc: socket %p  no space, %d*2 > %ld, not enqueued\n",
-                       svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg,
-                       svc_sock_wspace(svsk));
-               svsk->sk_pool = NULL;
-               clear_bit(SK_BUSY, &svsk->sk_flags);
-               goto out_unlock;
-       }
-       clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
-
-
-       if (!list_empty(&pool->sp_threads)) {
-               rqstp = list_entry(pool->sp_threads.next,
-                                  struct svc_rqst,
-                                  rq_list);
-               dprintk("svc: socket %p served by daemon %p\n",
-                       svsk->sk_sk, rqstp);
-               svc_thread_dequeue(pool, rqstp);
-               if (rqstp->rq_sock)
-                       printk(KERN_ERR
-                               "svc_sock_enqueue: server %p, rq_sock=%p!\n",
-                               rqstp, rqstp->rq_sock);
-               rqstp->rq_sock = svsk;
-               atomic_inc(&svsk->sk_inuse);
-               rqstp->rq_reserved = serv->sv_max_mesg;
-               atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
-               BUG_ON(svsk->sk_pool != pool);
-               wake_up(&rqstp->rq_wait);
-       } else {
-               dprintk("svc: socket %p put into queue\n", svsk->sk_sk);
-               list_add_tail(&svsk->sk_ready, &pool->sp_sockets);
-               BUG_ON(svsk->sk_pool != pool);
-       }
-
-out_unlock:
-       spin_unlock_bh(&pool->sp_lock);
-}
-
-/*
- * Dequeue the first socket.  Must be called with the pool->sp_lock held.
- */
-static inline struct svc_sock *
-svc_sock_dequeue(struct svc_pool *pool)
-{
-       struct svc_sock *svsk;
-
-       if (list_empty(&pool->sp_sockets))
-               return NULL;
-
-       svsk = list_entry(pool->sp_sockets.next,
-                         struct svc_sock, sk_ready);
-       list_del_init(&svsk->sk_ready);
-
-       dprintk("svc: socket %p dequeued, inuse=%d\n",
-               svsk->sk_sk, atomic_read(&svsk->sk_inuse));
-
-       return svsk;
-}
-
-/*
- * Having read something from a socket, check whether it
- * needs to be re-enqueued.
- * Note: SK_DATA only gets cleared when a read-attempt finds
- * no (or insufficient) data.
- */
-static inline void
-svc_sock_received(struct svc_sock *svsk)
-{
-       svsk->sk_pool = NULL;
-       clear_bit(SK_BUSY, &svsk->sk_flags);
-       svc_sock_enqueue(svsk);
-}
-
-
-/**
- * svc_reserve - change the space reserved for the reply to a request.
- * @rqstp:  The request in question
- * @space: new max space to reserve
- *
- * Each request reserves some space on the output queue of the socket
- * to make sure the reply fits.  This function reduces that reserved
- * space to be the amount of space used already, plus @space.
- *
- */
-void svc_reserve(struct svc_rqst *rqstp, int space)
-{
-       space += rqstp->rq_res.head[0].iov_len;
-
-       if (space < rqstp->rq_reserved) {
-               struct svc_sock *svsk = rqstp->rq_sock;
-               atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved);
-               rqstp->rq_reserved = space;
-
-               svc_sock_enqueue(svsk);
-       }
-}
-
-/*
- * Release a socket after use.
- */
-static inline void
-svc_sock_put(struct svc_sock *svsk)
-{
-       if (atomic_dec_and_test(&svsk->sk_inuse)) {
-               BUG_ON(! test_bit(SK_DEAD, &svsk->sk_flags));
-
-               dprintk("svc: releasing dead socket\n");
-               if (svsk->sk_sock->file)
-                       sockfd_put(svsk->sk_sock);
-               else
-                       sock_release(svsk->sk_sock);
-               if (svsk->sk_info_authunix != NULL)
-                       svcauth_unix_info_release(svsk->sk_info_authunix);
-               kfree(svsk);
-       }
-}
-
-static void
-svc_sock_release(struct svc_rqst *rqstp)
-{
-       struct svc_sock *svsk = rqstp->rq_sock;
-
-       svc_release_skb(rqstp);
-
-       svc_free_res_pages(rqstp);
-       rqstp->rq_res.page_len = 0;
-       rqstp->rq_res.page_base = 0;
-
-
-       /* Reset response buffer and release
-        * the reservation.
-        * But first, check that enough space was reserved
-        * for the reply, otherwise we have a bug!
-        */
-       if ((rqstp->rq_res.len) >  rqstp->rq_reserved)
-               printk(KERN_ERR "RPC request reserved %d but used %d\n",
-                      rqstp->rq_reserved,
-                      rqstp->rq_res.len);
-
-       rqstp->rq_res.head[0].iov_len = 0;
-       svc_reserve(rqstp, 0);
-       rqstp->rq_sock = NULL;
-
-       svc_sock_put(svsk);
-}
-
-/*
- * External function to wake up a server waiting for data
- * This really only makes sense for services like lockd
- * which have exactly one thread anyway.
- */
-void
-svc_wake_up(struct svc_serv *serv)
-{
-       struct svc_rqst *rqstp;
-       unsigned int i;
-       struct svc_pool *pool;
-
-       for (i = 0; i < serv->sv_nrpools; i++) {
-               pool = &serv->sv_pools[i];
-
-               spin_lock_bh(&pool->sp_lock);
-               if (!list_empty(&pool->sp_threads)) {
-                       rqstp = list_entry(pool->sp_threads.next,
-                                          struct svc_rqst,
-                                          rq_list);
-                       dprintk("svc: daemon %p woken up.\n", rqstp);
-                       /*
-                       svc_thread_dequeue(pool, rqstp);
-                       rqstp->rq_sock = NULL;
-                        */
-                       wake_up(&rqstp->rq_wait);
-               }
-               spin_unlock_bh(&pool->sp_lock);
-       }
-}
-
 union svc_pktinfo_u {
        struct in_pktinfo pkti;
        struct in6_pktinfo pkti6;
@@ -459,7 +125,9 @@ union svc_pktinfo_u {
 
 static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
 {
-       switch (rqstp->rq_sock->sk_sk->sk_family) {
+       struct svc_sock *svsk =
+               container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+       switch (svsk->sk_sk->sk_family) {
        case AF_INET: {
                        struct in_pktinfo *pki = CMSG_DATA(cmh);
 
@@ -489,10 +157,10 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
 /*
  * Generic sendto routine
  */
-static int
-svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
+static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
 {
-       struct svc_sock *svsk = rqstp->rq_sock;
+       struct svc_sock *svsk =
+               container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
        struct socket   *sock = svsk->sk_sock;
        int             slen;
        union {
@@ -565,7 +233,7 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
        }
 out:
        dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
-               rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len,
+               svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
                xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
 
        return len;
@@ -602,7 +270,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
        if (!serv)
                return 0;
        spin_lock_bh(&serv->sv_lock);
-       list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) {
+       list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) {
                int onelen = one_sock_name(buf+len, svsk);
                if (toclose && strcmp(toclose, buf+len) == 0)
                        closesk = svsk;
@@ -614,7 +282,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
                /* Should unregister with portmap, but you cannot
                 * unregister just one protocol...
                 */
-               svc_close_socket(closesk);
+               svc_close_xprt(&closesk->sk_xprt);
        else if (toclose)
                return -ENOENT;
        return len;
@@ -624,8 +292,7 @@ EXPORT_SYMBOL(svc_sock_names);
 /*
  * Check input queue length
  */
-static int
-svc_recv_available(struct svc_sock *svsk)
+static int svc_recv_available(struct svc_sock *svsk)
 {
        struct socket   *sock = svsk->sk_sock;
        int             avail, err;
@@ -638,48 +305,31 @@ svc_recv_available(struct svc_sock *svsk)
 /*
  * Generic recvfrom routine.
  */
-static int
-svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen)
+static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
+                       int buflen)
 {
-       struct svc_sock *svsk = rqstp->rq_sock;
+       struct svc_sock *svsk =
+               container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
        struct msghdr msg = {
                .msg_flags      = MSG_DONTWAIT,
        };
-       struct sockaddr *sin;
        int len;
 
+       rqstp->rq_xprt_hlen = 0;        /* zeroed before every receive */
+
        len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
                                msg.msg_flags);
 
-       /* sock_recvmsg doesn't fill in the name/namelen, so we must..
-        */
-       memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen);
-       rqstp->rq_addrlen = svsk->sk_remotelen;
-
-       /* Destination address in request is needed for binding the
-        * source address in RPC callbacks later.
-        */
-       sin = (struct sockaddr *)&svsk->sk_local;
-       switch (sin->sa_family) {
-       case AF_INET:
-               rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
-               break;
-       case AF_INET6:
-               rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
-               break;
-       }
-
        dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
                svsk, iov[0].iov_base, iov[0].iov_len, len);
-
        return len;
 }
 
 /*
  * Set socket snd and rcv buffer lengths
  */
-static inline void
-svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
+static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
+                               unsigned int rcv)
 {
 #if 0
        mm_segment_t    oldfs;
@@ -704,16 +354,16 @@ svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
 /*
  * INET callback when data has been received on the socket.
  */
-static void
-svc_udp_data_ready(struct sock *sk, int count)
+static void svc_udp_data_ready(struct sock *sk, int count)
 {
        struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
 
        if (svsk) {
                dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
-                       svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags));
-               set_bit(SK_DATA, &svsk->sk_flags);
-               svc_sock_enqueue(svsk);
+                       svsk, sk, count,
+                       test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
+               set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+               svc_xprt_enqueue(&svsk->sk_xprt);
        }
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
@@ -722,15 +372,14 @@ svc_udp_data_ready(struct sock *sk, int count)
 /*
  * INET callback when space is newly available on the socket.
  */
-static void
-svc_write_space(struct sock *sk)
+static void svc_write_space(struct sock *sk)
 {
        struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
 
        if (svsk) {
                dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
-                       svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags));
-               svc_sock_enqueue(svsk);
+                       svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
+               svc_xprt_enqueue(&svsk->sk_xprt);
        }
 
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
@@ -740,10 +389,19 @@ svc_write_space(struct sock *sk)
        }
 }
 
-static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp,
-                                           struct cmsghdr *cmh)
+/*
+ * Copy the UDP datagram's destination address to the rqstp structure.
+ * The 'destination' address in this case is the address to which the
+ * peer sent the datagram, i.e. our local address. For multihomed
+ * hosts, this can change from msg to msg. Note that only the IP
+ * address changes, the port number should remain the same.
+ */
+static void svc_udp_get_dest_address(struct svc_rqst *rqstp,
+                                    struct cmsghdr *cmh)
 {
-       switch (rqstp->rq_sock->sk_sk->sk_family) {
+       struct svc_sock *svsk =
+               container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+       switch (svsk->sk_sk->sk_family) {
        case AF_INET: {
                struct in_pktinfo *pki = CMSG_DATA(cmh);
                rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr;
@@ -760,11 +418,11 @@ static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp,
 /*
  * Receive a datagram from a UDP socket.
  */
-static int
-svc_udp_recvfrom(struct svc_rqst *rqstp)
+static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 {
-       struct svc_sock *svsk = rqstp->rq_sock;
-       struct svc_serv *serv = svsk->sk_server;
+       struct svc_sock *svsk =
+               container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+       struct svc_serv *serv = svsk->sk_xprt.xpt_server;
        struct sk_buff  *skb;
        union {
                struct cmsghdr  hdr;
@@ -779,7 +437,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
                .msg_flags = MSG_DONTWAIT,
        };
 
-       if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
+       if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
            /* udp sockets need large rcvbuf as all pending
             * requests are still in that buffer.  sndbuf must
             * also be large enough that there is enough space
@@ -792,17 +450,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
                                (serv->sv_nrthreads+3) * serv->sv_max_mesg,
                                (serv->sv_nrthreads+3) * serv->sv_max_mesg);
 
-       if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) {
-               svc_sock_received(svsk);
-               return svc_deferred_recv(rqstp);
-       }
-
-       if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
-               svc_delete_socket(svsk);
-               return 0;
-       }
-
-       clear_bit(SK_DATA, &svsk->sk_flags);
+       clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
        skb = NULL;
        err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
                             0, 0, MSG_PEEK | MSG_DONTWAIT);
@@ -813,24 +461,27 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
                if (err != -EAGAIN) {
                        /* possibly an icmp error */
                        dprintk("svc: recvfrom returned error %d\n", -err);
-                       set_bit(SK_DATA, &svsk->sk_flags);
+                       set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
                }
-               svc_sock_received(svsk);
+               svc_xprt_received(&svsk->sk_xprt);
                return -EAGAIN;
        }
-       rqstp->rq_addrlen = sizeof(rqstp->rq_addr);
+       len = svc_addr_len(svc_addr(rqstp));
+       if (len < 0)
+               return len;
+       rqstp->rq_addrlen = len;
        if (skb->tstamp.tv64 == 0) {
                skb->tstamp = ktime_get_real();
                /* Don't enable netstamp, sunrpc doesn't
                   need that much accuracy */
        }
        svsk->sk_sk->sk_stamp = skb->tstamp;
-       set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */
+       set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
 
        /*
         * Maybe more packets - kick another thread ASAP.
         */
-       svc_sock_received(svsk);
+       svc_xprt_received(&svsk->sk_xprt);
 
        len  = skb->len - sizeof(struct udphdr);
        rqstp->rq_arg.len = len;
@@ -861,13 +512,14 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
                skb_free_datagram(svsk->sk_sk, skb);
        } else {
                /* we can use it in-place */
-               rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr);
+               rqstp->rq_arg.head[0].iov_base = skb->data +
+                       sizeof(struct udphdr);
                rqstp->rq_arg.head[0].iov_len = len;
                if (skb_checksum_complete(skb)) {
                        skb_free_datagram(svsk->sk_sk, skb);
                        return 0;
                }
-               rqstp->rq_skbuff = skb;
+               rqstp->rq_xprt_ctxt = skb;
        }
 
        rqstp->rq_arg.page_base = 0;
@@ -900,27 +552,81 @@ svc_udp_sendto(struct svc_rqst *rqstp)
        return error;
 }
 
-static void
-svc_udp_init(struct svc_sock *svsk)
+static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
+{  /* empty: nothing is reserved for UDP replies; svc_tcp_prep_reply_hdr() reserves a length word */
+}
+
+static int svc_udp_has_wspace(struct svc_xprt *xprt)
+{
+       struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+       struct svc_serv *serv = xprt->xpt_server;
+       unsigned long required;
+
+       /*
+        * Report write space: 1 if twice (reserved + one max-size message)
+        * fits, else 0. SOCK_NOSPACE is set first, cleared only on the 1 path.
+        */
+       set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+       required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
+       if (required*2 > sock_wspace(svsk->sk_sk))
+               return 0;  /* SOCK_NOSPACE deliberately left set on this path */
+       clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+       return 1;
+}
+
+static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
+{
+       BUG();  /* must never be called; presumably because UDP has no connections to accept -- TODO confirm */
+       return NULL;  /* fallback value should BUG() ever return */
+}
+
+static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
+                                      struct sockaddr *sa, int salen,
+                                      int flags)
+{
+       /* same helper svc_tcp_create() uses; only the protocol argument differs */
+       return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags);
+}
+
+static struct svc_xprt_ops svc_udp_ops = {  /* method table referenced by svc_udp_class */
+       .xpo_create = svc_udp_create,  /* svc_create_socket() with IPPROTO_UDP */
+       .xpo_recvfrom = svc_udp_recvfrom,
+       .xpo_sendto = svc_udp_sendto,
+       .xpo_release_rqst = svc_release_skb,  /* same handler as in svc_tcp_ops */
+       .xpo_detach = svc_sock_detach,  /* same handler as in svc_tcp_ops */
+       .xpo_free = svc_sock_free,  /* same handler as in svc_tcp_ops */
+       .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,  /* empty function */
+       .xpo_has_wspace = svc_udp_has_wspace,
+       .xpo_accept = svc_udp_accept,  /* BUG()s if ever invoked */
+};
+
+static struct svc_xprt_class svc_udp_class = {  /* registered in svc_init_xprt_sock(); passed to svc_xprt_init() by svc_udp_init() */
+       .xcl_name = "udp",
+       .xcl_owner = THIS_MODULE,
+       .xcl_ops = &svc_udp_ops,
+       .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
+};
+
+static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
 {
        int one = 1;
        mm_segment_t oldfs;
 
+       svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv);
+       clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
        svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
        svsk->sk_sk->sk_write_space = svc_write_space;
-       svsk->sk_recvfrom = svc_udp_recvfrom;
-       svsk->sk_sendto = svc_udp_sendto;
 
        /* initialise setting must have enough space to
         * receive and respond to one request.
         * svc_udp_recvfrom will re-adjust if necessary
         */
        svc_sock_setbufsize(svsk->sk_sock,
-                           3 * svsk->sk_server->sv_max_mesg,
-                           3 * svsk->sk_server->sv_max_mesg);
+                           3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
+                           3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
 
-       set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */
-       set_bit(SK_CHNGBUF, &svsk->sk_flags);
+       /* data might have come in before data_ready set up */
+       set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+       set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
 
        oldfs = get_fs();
        set_fs(KERNEL_DS);
@@ -934,8 +640,7 @@ svc_udp_init(struct svc_sock *svsk)
  * A data_ready event on a listening socket means there's a connection
  * pending. Do not use state_change as a substitute for it.
  */
-static void
-svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
+static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
 {
        struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
 
@@ -954,8 +659,8 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
         */
        if (sk->sk_state == TCP_LISTEN) {
                if (svsk) {
-                       set_bit(SK_CONN, &svsk->sk_flags);
-                       svc_sock_enqueue(svsk);
+                       set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+                       svc_xprt_enqueue(&svsk->sk_xprt);
                } else
                        printk("svc: socket %p: no user data\n", sk);
        }
@@ -967,8 +672,7 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
 /*
  * A state change on a connected socket means it's dying or dead.
  */
-static void
-svc_tcp_state_change(struct sock *sk)
+static void svc_tcp_state_change(struct sock *sk)
 {
        struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
 
@@ -978,51 +682,36 @@ svc_tcp_state_change(struct sock *sk)
        if (!svsk)
                printk("svc: socket %p: no user data\n", sk);
        else {
-               set_bit(SK_CLOSE, &svsk->sk_flags);
-               svc_sock_enqueue(svsk);
+               set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+               svc_xprt_enqueue(&svsk->sk_xprt);
        }
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible_all(sk->sk_sleep);
 }
 
-static void
-svc_tcp_data_ready(struct sock *sk, int count)
+static void svc_tcp_data_ready(struct sock *sk, int count)
 {
        struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
 
        dprintk("svc: socket %p TCP data ready (svsk %p)\n",
                sk, sk->sk_user_data);
        if (svsk) {
-               set_bit(SK_DATA, &svsk->sk_flags);
-               svc_sock_enqueue(svsk);
+               set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+               svc_xprt_enqueue(&svsk->sk_xprt);
        }
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
 }
 
-static inline int svc_port_is_privileged(struct sockaddr *sin)
-{
-       switch (sin->sa_family) {
-       case AF_INET:
-               return ntohs(((struct sockaddr_in *)sin)->sin_port)
-                       < PROT_SOCK;
-       case AF_INET6:
-               return ntohs(((struct sockaddr_in6 *)sin)->sin6_port)
-                       < PROT_SOCK;
-       default:
-               return 0;
-       }
-}
-
 /*
  * Accept a TCP connection
  */
-static void
-svc_tcp_accept(struct svc_sock *svsk)
+static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
 {
+       struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
        struct sockaddr_storage addr;
        struct sockaddr *sin = (struct sockaddr *) &addr;
-       struct svc_serv *serv = svsk->sk_server;
+       struct svc_serv *serv = svsk->sk_xprt.xpt_server;
        struct socket   *sock = svsk->sk_sock;
        struct socket   *newsock;
        struct svc_sock *newsvsk;
@@ -1031,9 +720,9 @@ svc_tcp_accept(struct svc_sock *svsk)
 
        dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
        if (!sock)
-               return;
+               return NULL;
 
-       clear_bit(SK_CONN, &svsk->sk_flags);
+       clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
        err = kernel_accept(sock, &newsock, O_NONBLOCK);
        if (err < 0) {
                if (err == -ENOMEM)
@@ -1042,11 +731,9 @@ svc_tcp_accept(struct svc_sock *svsk)
                else if (err != -EAGAIN && net_ratelimit())
                        printk(KERN_WARNING "%s: accept failed (err %d)!\n",
                                   serv->sv_name, -err);
-               return;
+               return NULL;
        }
-
-       set_bit(SK_CONN, &svsk->sk_flags);
-       svc_sock_enqueue(svsk);
+       set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
 
        err = kernel_getpeername(newsock, sin, &slen);
        if (err < 0) {
@@ -1077,106 +764,42 @@ svc_tcp_accept(struct svc_sock *svsk)
        if (!(newsvsk = svc_setup_socket(serv, newsock, &err,
                                 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY))))
                goto failed;
-       memcpy(&newsvsk->sk_remote, sin, slen);
-       newsvsk->sk_remotelen = slen;
+       svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
        err = kernel_getsockname(newsock, sin, &slen);
        if (unlikely(err < 0)) {
                dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
                slen = offsetof(struct sockaddr, sa_data);
        }
-       memcpy(&newsvsk->sk_local, sin, slen);
-
-       svc_sock_received(newsvsk);
-
-       /* make sure that we don't have too many active connections.
-        * If we have, something must be dropped.
-        *
-        * There's no point in trying to do random drop here for
-        * DoS prevention. The NFS clients does 1 reconnect in 15
-        * seconds. An attacker can easily beat that.
-        *
-        * The only somewhat efficient mechanism would be if drop
-        * old connections from the same IP first. But right now
-        * we don't even record the client IP in svc_sock.
-        */
-       if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
-               struct svc_sock *svsk = NULL;
-               spin_lock_bh(&serv->sv_lock);
-               if (!list_empty(&serv->sv_tempsocks)) {
-                       if (net_ratelimit()) {
-                               /* Try to help the admin */
-                               printk(KERN_NOTICE "%s: too many open TCP "
-                                       "sockets, consider increasing the "
-                                       "number of nfsd threads\n",
-                                                  serv->sv_name);
-                               printk(KERN_NOTICE
-                                      "%s: last TCP connect from %s\n",
-                                      serv->sv_name, __svc_print_addr(sin,
-                                                       buf, sizeof(buf)));
-                       }
-                       /*
-                        * Always select the oldest socket. It's not fair,
-                        * but so is life
-                        */
-                       svsk = list_entry(serv->sv_tempsocks.prev,
-                                         struct svc_sock,
-                                         sk_list);
-                       set_bit(SK_CLOSE, &svsk->sk_flags);
-                       atomic_inc(&svsk->sk_inuse);
-               }
-               spin_unlock_bh(&serv->sv_lock);
-
-               if (svsk) {
-                       svc_sock_enqueue(svsk);
-                       svc_sock_put(svsk);
-               }
-
-       }
+       svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
 
        if (serv->sv_stats)
                serv->sv_stats->nettcpconn++;
 
-       return;
+       return &newsvsk->sk_xprt;
 
 failed:
        sock_release(newsock);
-       return;
+       return NULL;
 }
 
 /*
  * Receive data from a TCP socket.
  */
-static int
-svc_tcp_recvfrom(struct svc_rqst *rqstp)
+static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 {
-       struct svc_sock *svsk = rqstp->rq_sock;
-       struct svc_serv *serv = svsk->sk_server;
+       struct svc_sock *svsk =
+               container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+       struct svc_serv *serv = svsk->sk_xprt.xpt_server;
        int             len;
        struct kvec *vec;
        int pnum, vlen;
 
        dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
-               svsk, test_bit(SK_DATA, &svsk->sk_flags),
-               test_bit(SK_CONN, &svsk->sk_flags),
-               test_bit(SK_CLOSE, &svsk->sk_flags));
+               svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
+               test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
+               test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
 
-       if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) {
-               svc_sock_received(svsk);
-               return svc_deferred_recv(rqstp);
-       }
-
-       if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
-               svc_delete_socket(svsk);
-               return 0;
-       }
-
-       if (svsk->sk_sk->sk_state == TCP_LISTEN) {
-               svc_tcp_accept(svsk);
-               svc_sock_received(svsk);
-               return 0;
-       }
-
-       if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
+       if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
                /* sndbuf needs to have room for one request
                 * per thread, otherwise we can stall even when the
                 * network isn't a bottleneck.
@@ -1193,7 +816,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
                                    (serv->sv_nrthreads+3) * serv->sv_max_mesg,
                                    3 * serv->sv_max_mesg);
 
-       clear_bit(SK_DATA, &svsk->sk_flags);
+       clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
        /* Receive data. If we haven't got the record length yet, get
         * the next four bytes. Otherwise try to gobble up as much as
@@ -1212,7 +835,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
                if (len < want) {
                        dprintk("svc: short recvfrom while reading record length (%d of %lu)\n",
                                len, want);
-                       svc_sock_received(svsk);
+                       svc_xprt_received(&svsk->sk_xprt);
                        return -EAGAIN; /* record header not complete */
                }
 
@@ -1248,11 +871,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
        if (len < svsk->sk_reclen) {
                dprintk("svc: incomplete TCP record (%d of %d)\n",
                        len, svsk->sk_reclen);
-               svc_sock_received(svsk);
+               svc_xprt_received(&svsk->sk_xprt);
                return -EAGAIN; /* record not complete */
        }
        len = svsk->sk_reclen;
-       set_bit(SK_DATA, &svsk->sk_flags);
+       set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
        vec = rqstp->rq_vec;
        vec[0] = rqstp->rq_arg.head[0];
@@ -1281,30 +904,31 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
                rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
        }
 
-       rqstp->rq_skbuff      = NULL;
+       rqstp->rq_xprt_ctxt   = NULL;
        rqstp->rq_prot        = IPPROTO_TCP;
 
        /* Reset TCP read info */
        svsk->sk_reclen = 0;
        svsk->sk_tcplen = 0;
 
-       svc_sock_received(svsk);
+       svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
+       svc_xprt_received(&svsk->sk_xprt);
        if (serv->sv_stats)
                serv->sv_stats->nettcpcnt++;
 
        return len;
 
  err_delete:
-       svc_delete_socket(svsk);
+       set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
        return -EAGAIN;
 
  error:
        if (len == -EAGAIN) {
                dprintk("RPC: TCP recvfrom got EAGAIN\n");
-               svc_sock_received(svsk);
+               svc_xprt_received(&svsk->sk_xprt);
        } else {
                printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
-                                       svsk->sk_server->sv_name, -len);
+                      svsk->sk_xprt.xpt_server->sv_name, -len);
                goto err_delete;
        }
 
@@ -1314,8 +938,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
 /*
  * Send out data on TCP socket.
  */
-static int
-svc_tcp_sendto(struct svc_rqst *rqstp)
+static int svc_tcp_sendto(struct svc_rqst *rqstp)
 {
        struct xdr_buf  *xbufp = &rqstp->rq_res;
        int sent;
@@ -1328,35 +951,109 @@ svc_tcp_sendto(struct svc_rqst *rqstp)
        reclen = htonl(0x80000000|((xbufp->len ) - 4));
        memcpy(xbufp->head[0].iov_base, &reclen, 4);
 
-       if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags))
+       if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags))
                return -ENOTCONN;
 
        sent = svc_sendto(rqstp, &rqstp->rq_res);
        if (sent != xbufp->len) {
-               printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n",
-                      rqstp->rq_sock->sk_server->sv_name,
+               printk(KERN_NOTICE
+                      "rpc-srv/tcp: %s: %s %d when sending %d bytes "
+                      "- shutting down socket\n",
+                      rqstp->rq_xprt->xpt_server->sv_name,
                       (sent<0)?"got error":"sent only",
                       sent, xbufp->len);
-               set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags);
-               svc_sock_enqueue(rqstp->rq_sock);
+               set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
+               svc_xprt_enqueue(rqstp->rq_xprt);
                sent = -EAGAIN;
        }
        return sent;
 }
 
-static void
-svc_tcp_init(struct svc_sock *svsk)
+/*
+ * Set up the response header. TCP replies begin with a 4-byte record length.
+ */
+static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
+{
+       struct kvec *resv = &rqstp->rq_res.head[0];
+
+       /* placeholder 0 for the record length; svc_tcp_sendto() later copies the real value over the first 4 bytes of head[0] */
+       svc_putnl(resv, 0);
+}
+
+static int svc_tcp_has_wspace(struct svc_xprt *xprt)
+{
+       struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+       struct svc_serv *serv = svsk->sk_xprt.xpt_server;
+       int required;  /* NOTE(review): signed int here, unsigned long in svc_udp_has_wspace() for the same sum -- confirm intended */
+       int wspace;
+
+       /*
+        * Report write space: 0 if below the stream minimum or if twice
+        * (reserved + one max-size message) does not fit, else 1.
+        */
+       set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);  /* set before sampling the space */
+       required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
+       wspace = sk_stream_wspace(svsk->sk_sk);
+
+       if (wspace < sk_stream_min_wspace(svsk->sk_sk))
+               return 0;  /* SOCK_NOSPACE stays set */
+       if (required * 2 > wspace)
+               return 0;  /* SOCK_NOSPACE stays set */
+
+       clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);  /* cleared only when reporting room */
+       return 1;
+}
+
+static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
+                                      struct sockaddr *sa, int salen,
+                                      int flags)
+{
+       /* same helper svc_udp_create() uses; only the protocol argument differs */
+       return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags);
+}
+
+static struct svc_xprt_ops svc_tcp_ops = {  /* method table referenced by svc_tcp_class */
+       .xpo_create = svc_tcp_create,  /* svc_create_socket() with IPPROTO_TCP */
+       .xpo_recvfrom = svc_tcp_recvfrom,
+       .xpo_sendto = svc_tcp_sendto,
+       .xpo_release_rqst = svc_release_skb,  /* same handler as in svc_udp_ops */
+       .xpo_detach = svc_sock_detach,  /* same handler as in svc_udp_ops */
+       .xpo_free = svc_sock_free,  /* same handler as in svc_udp_ops */
+       .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,  /* reserves the record-length word */
+       .xpo_has_wspace = svc_tcp_has_wspace,
+       .xpo_accept = svc_tcp_accept,  /* returns the new connection's svc_xprt, or NULL */
+};
+
+static struct svc_xprt_class svc_tcp_class = {  /* registered in svc_init_xprt_sock(); passed to svc_xprt_init() by svc_tcp_init() */
+       .xcl_name = "tcp",
+       .xcl_owner = THIS_MODULE,
+       .xcl_ops = &svc_tcp_ops,
+       .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
+};
+
+void svc_init_xprt_sock(void)  /* register the socket-based "tcp" and "udp" transport classes */
+{
+       svc_reg_xprt_class(&svc_tcp_class);  /* NOTE(review): any result from svc_reg_xprt_class() is not checked -- confirm it cannot fail */
+       svc_reg_xprt_class(&svc_udp_class);
+}
+
+void svc_cleanup_xprt_sock(void)  /* undo svc_init_xprt_sock(): unregister both socket transport classes */
+{
+       svc_unreg_xprt_class(&svc_tcp_class);  /* same order as registration: tcp, then udp */
+       svc_unreg_xprt_class(&svc_udp_class);
+}
+
+static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 {
        struct sock     *sk = svsk->sk_sk;
        struct tcp_sock *tp = tcp_sk(sk);
 
-       svsk->sk_recvfrom = svc_tcp_recvfrom;
-       svsk->sk_sendto = svc_tcp_sendto;
-
+       svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv);
+       set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
        if (sk->sk_state == TCP_LISTEN) {
                dprintk("setting up TCP socket for listening\n");
+               set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
                sk->sk_data_ready = svc_tcp_listen_data_ready;
-               set_bit(SK_CONN, &svsk->sk_flags);
+               set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
        } else {
                dprintk("setting up TCP socket for reading\n");
                sk->sk_state_change = svc_tcp_state_change;
@@ -1373,18 +1070,17 @@ svc_tcp_init(struct svc_sock *svsk)
                 * svc_tcp_recvfrom will re-adjust if necessary
                 */
                svc_sock_setbufsize(svsk->sk_sock,
-                                   3 * svsk->sk_server->sv_max_mesg,
-                                   3 * svsk->sk_server->sv_max_mesg);
+                                   3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
+                                   3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
 
-               set_bit(SK_CHNGBUF, &svsk->sk_flags);
-               set_bit(SK_DATA, &svsk->sk_flags);
+               set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
+               set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
                if (sk->sk_state != TCP_ESTABLISHED)
-                       set_bit(SK_CLOSE, &svsk->sk_flags);
+                       set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
        }
 }
 
-void
-svc_sock_update_bufs(struct svc_serv *serv)
+void svc_sock_update_bufs(struct svc_serv *serv)
 {
        /*
         * The number of server threads has changed. Update
@@ -1395,231 +1091,17 @@ svc_sock_update_bufs(struct svc_serv *serv)
        spin_lock_bh(&serv->sv_lock);
        list_for_each(le, &serv->sv_permsocks) {
                struct svc_sock *svsk =
-                       list_entry(le, struct svc_sock, sk_list);
-               set_bit(SK_CHNGBUF, &svsk->sk_flags);
+                       list_entry(le, struct svc_sock, sk_xprt.xpt_list);
+               set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
        }
        list_for_each(le, &serv->sv_tempsocks) {
                struct svc_sock *svsk =
-                       list_entry(le, struct svc_sock, sk_list);
-               set_bit(SK_CHNGBUF, &svsk->sk_flags);
+                       list_entry(le, struct svc_sock, sk_xprt.xpt_list);
+               set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
        }
        spin_unlock_bh(&serv->sv_lock);
 }
 
-/*
- * Receive the next request on any socket.  This code is carefully
- * organised not to touch any cachelines in the shared svc_serv
- * structure, only cachelines in the local svc_pool.
- */
-int
-svc_recv(struct svc_rqst *rqstp, long timeout)
-{
-       struct svc_sock         *svsk = NULL;
-       struct svc_serv         *serv = rqstp->rq_server;
-       struct svc_pool         *pool = rqstp->rq_pool;
-       int                     len, i;
-       int                     pages;
-       struct xdr_buf          *arg;
-       DECLARE_WAITQUEUE(wait, current);
-
-       dprintk("svc: server %p waiting for data (to = %ld)\n",
-               rqstp, timeout);
-
-       if (rqstp->rq_sock)
-               printk(KERN_ERR
-                       "svc_recv: service %p, socket not NULL!\n",
-                        rqstp);
-       if (waitqueue_active(&rqstp->rq_wait))
-               printk(KERN_ERR
-                       "svc_recv: service %p, wait queue active!\n",
-                        rqstp);
-
-
-       /* now allocate needed pages.  If we get a failure, sleep briefly */
-       pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
-       for (i=0; i < pages ; i++)
-               while (rqstp->rq_pages[i] == NULL) {
-                       struct page *p = alloc_page(GFP_KERNEL);
-                       if (!p)
-                               schedule_timeout_uninterruptible(msecs_to_jiffies(500));
-                       rqstp->rq_pages[i] = p;
-               }
-       rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
-       BUG_ON(pages >= RPCSVC_MAXPAGES);
-
-       /* Make arg->head point to first page and arg->pages point to rest */
-       arg = &rqstp->rq_arg;
-       arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
-       arg->head[0].iov_len = PAGE_SIZE;
-       arg->pages = rqstp->rq_pages + 1;
-       arg->page_base = 0;
-       /* save at least one page for response */
-       arg->page_len = (pages-2)*PAGE_SIZE;
-       arg->len = (pages-1)*PAGE_SIZE;
-       arg->tail[0].iov_len = 0;
-
-       try_to_freeze();
-       cond_resched();
-       if (signalled())
-               return -EINTR;
-
-       spin_lock_bh(&pool->sp_lock);
-       if ((svsk = svc_sock_dequeue(pool)) != NULL) {
-               rqstp->rq_sock = svsk;
-               atomic_inc(&svsk->sk_inuse);
-               rqstp->rq_reserved = serv->sv_max_mesg;
-               atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
-       } else {
-               /* No data pending. Go to sleep */
-               svc_thread_enqueue(pool, rqstp);
-
-               /*
-                * We have to be able to interrupt this wait
-                * to bring down the daemons ...
-                */
-               set_current_state(TASK_INTERRUPTIBLE);
-               add_wait_queue(&rqstp->rq_wait, &wait);
-               spin_unlock_bh(&pool->sp_lock);
-
-               schedule_timeout(timeout);
-
-               try_to_freeze();
-
-               spin_lock_bh(&pool->sp_lock);
-               remove_wait_queue(&rqstp->rq_wait, &wait);
-
-               if (!(svsk = rqstp->rq_sock)) {
-                       svc_thread_dequeue(pool, rqstp);
-                       spin_unlock_bh(&pool->sp_lock);
-                       dprintk("svc: server %p, no data yet\n", rqstp);
-                       return signalled()? -EINTR : -EAGAIN;
-               }
-       }
-       spin_unlock_bh(&pool->sp_lock);
-
-       dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n",
-                rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse));
-       len = svsk->sk_recvfrom(rqstp);
-       dprintk("svc: got len=%d\n", len);
-
-       /* No data, incomplete (TCP) read, or accept() */
-       if (len == 0 || len == -EAGAIN) {
-               rqstp->rq_res.len = 0;
-               svc_sock_release(rqstp);
-               return -EAGAIN;
-       }
-       svsk->sk_lastrecv = get_seconds();
-       clear_bit(SK_OLD, &svsk->sk_flags);
-
-       rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
-       rqstp->rq_chandle.defer = svc_defer;
-
-       if (serv->sv_stats)
-               serv->sv_stats->netcnt++;
-       return len;
-}
-
-/*
- * Drop request
- */
-void
-svc_drop(struct svc_rqst *rqstp)
-{
-       dprintk("svc: socket %p dropped request\n", rqstp->rq_sock);
-       svc_sock_release(rqstp);
-}
-
-/*
- * Return reply to client.
- */
-int
-svc_send(struct svc_rqst *rqstp)
-{
-       struct svc_sock *svsk;
-       int             len;
-       struct xdr_buf  *xb;
-
-       if ((svsk = rqstp->rq_sock) == NULL) {
-               printk(KERN_WARNING "NULL socket pointer in %s:%d\n",
-                               __FILE__, __LINE__);
-               return -EFAULT;
-       }
-
-       /* release the receive skb before sending the reply */
-       svc_release_skb(rqstp);
-
-       /* calculate over-all length */
-       xb = & rqstp->rq_res;
-       xb->len = xb->head[0].iov_len +
-               xb->page_len +
-               xb->tail[0].iov_len;
-
-       /* Grab svsk->sk_mutex to serialize outgoing data. */
-       mutex_lock(&svsk->sk_mutex);
-       if (test_bit(SK_DEAD, &svsk->sk_flags))
-               len = -ENOTCONN;
-       else
-               len = svsk->sk_sendto(rqstp);
-       mutex_unlock(&svsk->sk_mutex);
-       svc_sock_release(rqstp);
-
-       if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
-               return 0;
-       return len;
-}
-
-/*
- * Timer function to close old temporary sockets, using
- * a mark-and-sweep algorithm.
- */
-static void
-svc_age_temp_sockets(unsigned long closure)
-{
-       struct svc_serv *serv = (struct svc_serv *)closure;
-       struct svc_sock *svsk;
-       struct list_head *le, *next;
-       LIST_HEAD(to_be_aged);
-
-       dprintk("svc_age_temp_sockets\n");
-
-       if (!spin_trylock_bh(&serv->sv_lock)) {
-               /* busy, try again 1 sec later */
-               dprintk("svc_age_temp_sockets: busy\n");
-               mod_timer(&serv->sv_temptimer, jiffies + HZ);
-               return;
-       }
-
-       list_for_each_safe(le, next, &serv->sv_tempsocks) {
-               svsk = list_entry(le, struct svc_sock, sk_list);
-
-               if (!test_and_set_bit(SK_OLD, &svsk->sk_flags))
-                       continue;
-               if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags))
-                       continue;
-               atomic_inc(&svsk->sk_inuse);
-               list_move(le, &to_be_aged);
-               set_bit(SK_CLOSE, &svsk->sk_flags);
-               set_bit(SK_DETACHED, &svsk->sk_flags);
-       }
-       spin_unlock_bh(&serv->sv_lock);
-
-       while (!list_empty(&to_be_aged)) {
-               le = to_be_aged.next;
-               /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */
-               list_del_init(le);
-               svsk = list_entry(le, struct svc_sock, sk_list);
-
-               dprintk("queuing svsk %p for closing, %lu seconds old\n",
-                       svsk, get_seconds() - svsk->sk_lastrecv);
-
-               /* a thread will dequeue and close it soon */
-               svc_sock_enqueue(svsk);
-               svc_sock_put(svsk);
-       }
-
-       mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
-}
-
 /*
  * Initialize socket for RPC use and create svc_sock struct
  * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
@@ -1631,7 +1113,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
        struct svc_sock *svsk;
        struct sock     *inet;
        int             pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
-       int             is_temporary = flags & SVC_SOCK_TEMPORARY;
 
        dprintk("svc: svc_setup_socket %p\n", sock);
        if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1651,44 +1132,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
                return NULL;
        }
 
-       set_bit(SK_BUSY, &svsk->sk_flags);
        inet->sk_user_data = svsk;
        svsk->sk_sock = sock;
        svsk->sk_sk = inet;
        svsk->sk_ostate = inet->sk_state_change;
        svsk->sk_odata = inet->sk_data_ready;
        svsk->sk_owspace = inet->sk_write_space;
-       svsk->sk_server = serv;
-       atomic_set(&svsk->sk_inuse, 1);
-       svsk->sk_lastrecv = get_seconds();
-       spin_lock_init(&svsk->sk_lock);
-       INIT_LIST_HEAD(&svsk->sk_deferred);
-       INIT_LIST_HEAD(&svsk->sk_ready);
-       mutex_init(&svsk->sk_mutex);
 
        /* Initialize the socket */
        if (sock->type == SOCK_DGRAM)
-               svc_udp_init(svsk);
+               svc_udp_init(svsk, serv);
        else
-               svc_tcp_init(svsk);
-
-       spin_lock_bh(&serv->sv_lock);
-       if (is_temporary) {
-               set_bit(SK_TEMP, &svsk->sk_flags);
-               list_add(&svsk->sk_list, &serv->sv_tempsocks);
-               serv->sv_tmpcnt++;
-               if (serv->sv_temptimer.function == NULL) {
-                       /* setup timer to age temp sockets */
-                       setup_timer(&serv->sv_temptimer, svc_age_temp_sockets,
-                                       (unsigned long)serv);
-                       mod_timer(&serv->sv_temptimer,
-                                       jiffies + svc_conn_age_period * HZ);
-               }
-       } else {
-               clear_bit(SK_TEMP, &svsk->sk_flags);
-               list_add(&svsk->sk_list, &serv->sv_permsocks);
-       }
-       spin_unlock_bh(&serv->sv_lock);
+               svc_tcp_init(svsk, serv);
 
        dprintk("svc: svc_setup_socket created %p (inet %p)\n",
                                svsk, svsk->sk_sk);
@@ -1717,7 +1172,16 @@ int svc_addsock(struct svc_serv *serv,
        else {
                svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS);
                if (svsk) {
-                       svc_sock_received(svsk);
+                       struct sockaddr_storage addr;
+                       struct sockaddr *sin = (struct sockaddr *)&addr;
+                       int salen;
+                       if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
+                               svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
+                       clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags);
+                       spin_lock_bh(&serv->sv_lock);
+                       list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks);
+                       spin_unlock_bh(&serv->sv_lock);
+                       svc_xprt_received(&svsk->sk_xprt);
                        err = 0;
                }
        }
@@ -1733,14 +1197,19 @@ EXPORT_SYMBOL_GPL(svc_addsock);
 /*
  * Create socket for RPC service.
  */
-static int svc_create_socket(struct svc_serv *serv, int protocol,
-                               struct sockaddr *sin, int len, int flags)
+static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
+                                         int protocol,
+                                         struct sockaddr *sin, int len,
+                                         int flags)
 {
        struct svc_sock *svsk;
        struct socket   *sock;
        int             error;
        int             type;
        char            buf[RPC_MAX_ADDRBUFLEN];
+       struct sockaddr_storage addr;
+       struct sockaddr *newsin = (struct sockaddr *)&addr;
+       int             newlen;
 
        dprintk("svc: svc_create_socket(%s, %d, %s)\n",
                        serv->sv_program->pg_name, protocol,
@@ -1749,13 +1218,13 @@ static int svc_create_socket(struct svc_serv *serv, int protocol,
        if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
                printk(KERN_WARNING "svc: only UDP and TCP "
                                "sockets supported\n");
-               return -EINVAL;
+               return ERR_PTR(-EINVAL);
        }
        type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
 
        error = sock_create_kern(sin->sa_family, type, protocol, &sock);
        if (error < 0)
-               return error;
+               return ERR_PTR(error);
 
        svc_reclassify_socket(sock);
 
@@ -1765,203 +1234,55 @@ static int svc_create_socket(struct svc_serv *serv, int protocol,
        if (error < 0)
                goto bummer;
 
+       newlen = len;
+       error = kernel_getsockname(sock, newsin, &newlen);
+       if (error < 0)
+               goto bummer;
+
        if (protocol == IPPROTO_TCP) {
                if ((error = kernel_listen(sock, 64)) < 0)
                        goto bummer;
        }
 
        if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) {
-               svc_sock_received(svsk);
-               return ntohs(inet_sk(svsk->sk_sk)->sport);
+               svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
+               return (struct svc_xprt *)svsk;
        }
 
 bummer:
        dprintk("svc: svc_create_socket error = %d\n", -error);
        sock_release(sock);
-       return error;
+       return ERR_PTR(error);
 }
 
 /*
- * Remove a dead socket
+ * Detach the svc_sock from the socket so that no
+ * more callbacks occur.
  */
-static void
-svc_delete_socket(struct svc_sock *svsk)
+static void svc_sock_detach(struct svc_xprt *xprt)
 {
-       struct svc_serv *serv;
-       struct sock     *sk;
-
-       dprintk("svc: svc_delete_socket(%p)\n", svsk);
+       struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+       struct sock *sk = svsk->sk_sk;
 
-       serv = svsk->sk_server;
-       sk = svsk->sk_sk;
+       dprintk("svc: svc_sock_detach(%p)\n", svsk);
 
+       /* put back the old socket callbacks */
        sk->sk_state_change = svsk->sk_ostate;
        sk->sk_data_ready = svsk->sk_odata;
        sk->sk_write_space = svsk->sk_owspace;
-
-       spin_lock_bh(&serv->sv_lock);
-
-       if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags))
-               list_del_init(&svsk->sk_list);
-       /*
-        * We used to delete the svc_sock from whichever list
-        * it's sk_ready node was on, but we don't actually
-        * need to.  This is because the only time we're called
-        * while still attached to a queue, the queue itself
-        * is about to be destroyed (in svc_destroy).
-        */
-       if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) {
-               BUG_ON(atomic_read(&svsk->sk_inuse)<2);
-               atomic_dec(&svsk->sk_inuse);
-               if (test_bit(SK_TEMP, &svsk->sk_flags))
-                       serv->sv_tmpcnt--;
-       }
-
-       spin_unlock_bh(&serv->sv_lock);
-}
-
-static void svc_close_socket(struct svc_sock *svsk)
-{
-       set_bit(SK_CLOSE, &svsk->sk_flags);
-       if (test_and_set_bit(SK_BUSY, &svsk->sk_flags))
-               /* someone else will have to effect the close */
-               return;
-
-       atomic_inc(&svsk->sk_inuse);
-       svc_delete_socket(svsk);
-       clear_bit(SK_BUSY, &svsk->sk_flags);
-       svc_sock_put(svsk);
-}
-
-void svc_force_close_socket(struct svc_sock *svsk)
-{
-       set_bit(SK_CLOSE, &svsk->sk_flags);
-       if (test_bit(SK_BUSY, &svsk->sk_flags)) {
-               /* Waiting to be processed, but no threads left,
-                * So just remove it from the waiting list
-                */
-               list_del_init(&svsk->sk_ready);
-               clear_bit(SK_BUSY, &svsk->sk_flags);
-       }
-       svc_close_socket(svsk);
-}
-
-/**
- * svc_makesock - Make a socket for nfsd and lockd
- * @serv: RPC server structure
- * @protocol: transport protocol to use
- * @port: port to use
- * @flags: requested socket characteristics
- *
- */
-int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port,
-                       int flags)
-{
-       struct sockaddr_in sin = {
-               .sin_family             = AF_INET,
-               .sin_addr.s_addr        = INADDR_ANY,
-               .sin_port               = htons(port),
-       };
-
-       dprintk("svc: creating socket proto = %d\n", protocol);
-       return svc_create_socket(serv, protocol, (struct sockaddr *) &sin,
-                                                       sizeof(sin), flags);
 }
 
 /*
- * Handle defer and revisit of requests
+ * Free the svc_sock's socket resources and the svc_sock itself.
  */
-
-static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
+static void svc_sock_free(struct svc_xprt *xprt)
 {
-       struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle);
-       struct svc_sock *svsk;
+       struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+       dprintk("svc: svc_sock_free(%p)\n", svsk);
 
-       if (too_many) {
-               svc_sock_put(dr->svsk);
-               kfree(dr);
-               return;
-       }
-       dprintk("revisit queued\n");
-       svsk = dr->svsk;
-       dr->svsk = NULL;
-       spin_lock(&svsk->sk_lock);
-       list_add(&dr->handle.recent, &svsk->sk_deferred);
-       spin_unlock(&svsk->sk_lock);
-       set_bit(SK_DEFERRED, &svsk->sk_flags);
-       svc_sock_enqueue(svsk);
-       svc_sock_put(svsk);
-}
-
-static struct cache_deferred_req *
-svc_defer(struct cache_req *req)
-{
-       struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
-       int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len);
-       struct svc_deferred_req *dr;
-
-       if (rqstp->rq_arg.page_len)
-               return NULL; /* if more than a page, give up FIXME */
-       if (rqstp->rq_deferred) {
-               dr = rqstp->rq_deferred;
-               rqstp->rq_deferred = NULL;
-       } else {
-               int skip  = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
-               /* FIXME maybe discard if size too large */
-               dr = kmalloc(size, GFP_KERNEL);
-               if (dr == NULL)
-                       return NULL;
-
-               dr->handle.owner = rqstp->rq_server;
-               dr->prot = rqstp->rq_prot;
-               memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
-               dr->addrlen = rqstp->rq_addrlen;
-               dr->daddr = rqstp->rq_daddr;
-               dr->argslen = rqstp->rq_arg.len >> 2;
-               memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
-       }
-       atomic_inc(&rqstp->rq_sock->sk_inuse);
-       dr->svsk = rqstp->rq_sock;
-
-       dr->handle.revisit = svc_revisit;
-       return &dr->handle;
-}
-
-/*
- * recv data from a deferred request into an active one
- */
-static int svc_deferred_recv(struct svc_rqst *rqstp)
-{
-       struct svc_deferred_req *dr = rqstp->rq_deferred;
-
-       rqstp->rq_arg.head[0].iov_base = dr->args;
-       rqstp->rq_arg.head[0].iov_len = dr->argslen<<2;
-       rqstp->rq_arg.page_len = 0;
-       rqstp->rq_arg.len = dr->argslen<<2;
-       rqstp->rq_prot        = dr->prot;
-       memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
-       rqstp->rq_addrlen     = dr->addrlen;
-       rqstp->rq_daddr       = dr->daddr;
-       rqstp->rq_respages    = rqstp->rq_pages;
-       return dr->argslen<<2;
-}
-
-
-static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
-{
-       struct svc_deferred_req *dr = NULL;
-
-       if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
-               return NULL;
-       spin_lock(&svsk->sk_lock);
-       clear_bit(SK_DEFERRED, &svsk->sk_flags);
-       if (!list_empty(&svsk->sk_deferred)) {
-               dr = list_entry(svsk->sk_deferred.next,
-                               struct svc_deferred_req,
-                               handle.recent);
-               list_del_init(&dr->handle.recent);
-               set_bit(SK_DEFERRED, &svsk->sk_flags);
-       }
-       spin_unlock(&svsk->sk_lock);
-       return dr;
+       if (svsk->sk_sock->file)
+               sockfd_put(svsk->sk_sock);
+       else
+               sock_release(svsk->sk_sock);
+       kfree(svsk);
 }
index bada7de0c2fcd6a216f44817cb67e03f43b736b5..0f8c439b848a1eab04f0f21f2ea763863e424b2f 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/sched.h>
 #include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svc_xprt.h>
 
 /*
  * Declare the debug flags here
@@ -55,6 +56,30 @@ rpc_unregister_sysctl(void)
        }
 }
 
+/*
+ * Read handler for the "transports" sysctl file.
+ *
+ * Fills a stack buffer through svc_print_xprts() and copies the result to
+ * user space.  Only a read that starts at offset zero produces data: a read
+ * at any later offset, or any call with a zero-length buffer, reports zero
+ * bytes.  Writes are rejected with -EINVAL.
+ *
+ * On a successful read *lenp is set to the number of bytes copied and
+ * *ppos is advanced by the same amount.  Returns 0 on success and -EFAULT
+ * if the user buffer cannot be written.
+ */
+static int proc_do_xprt(ctl_table *table, int write, struct file *file,
+                       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       char tmpbuf[256];
+       int len;
+
+       if ((*ppos && !write) || !*lenp) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write)
+               return -EINVAL;
+
+       len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
+       /*
+        * Defensive: a negative value must not reach the unsigned
+        * comparison below, where it would turn into a huge length.
+        * NOTE(review): assumes a negative return is an error code.
+        */
+       if (len < 0)
+               return len;
+       /* Never copy more than the caller's buffer can hold. */
+       if (len > *lenp)
+               len = *lenp;
+       if (copy_to_user(buffer, tmpbuf, len))
+               return -EFAULT;
+
+       /* Report the bytes produced, not the unused rest of the buffer. */
+       *lenp = len;
+       *ppos += len;
+       return 0;
+}
+
 static int
 proc_dodebug(ctl_table *table, int write, struct file *file,
                                void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -147,6 +172,12 @@ static ctl_table debug_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dodebug
        },
+       {
+               .procname       = "transports",
+               .maxlen         = 256,
+               .mode           = 0444,
+               .proc_handler   = &proc_do_xprt,
+       },
        { .ctl_name = 0 }
 };
 
index 54264062ea695d59f85da1d712f53cd0ff45952a..995c3fdc16c27821d2e97887a6bdba002a086321 100644 (file)
@@ -96,11 +96,13 @@ xdr_encode_string(__be32 *p, const char *string)
 EXPORT_SYMBOL(xdr_encode_string);
 
 __be32 *
-xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen)
+xdr_decode_string_inplace(__be32 *p, char **sp,
+                         unsigned int *lenp, unsigned int maxlen)
 {
-       unsigned int    len;
+       u32 len;
 
-       if ((len = ntohl(*p++)) > maxlen)
+       len = ntohl(*p++);
+       if (len > maxlen)
                return NULL;
        *lenp = len;
        *sp = (char *) p;
index 264f0feeb513e24b8d4b3cfef9e6850b1a0fcff1..5a8f268bdd30c5dedfcb73dc6b071c1263553875 100644 (file)
@@ -1,3 +1,8 @@
 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
 
 xprtrdma-y := transport.o rpc_rdma.o verbs.o
+
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o
+
+svcrdma-y := svc_rdma.o svc_rdma_transport.o \
+       svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
new file mode 100644 (file)
index 0000000..88c0ca2
--- /dev/null
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY        RPCDBG_SVCXPRT
+
+/*
+ * RPC/RDMA parameters.  Each tunable below is followed by the lower and
+ * upper bounds that its sysctl table entry references through
+ * extra1/extra2.
+ */
+unsigned int svcrdma_ord = RPCRDMA_ORD;
+static unsigned int min_ord = 1;
+static unsigned int max_ord = 4096;
+unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
+static unsigned int min_max_requests = 4;
+static unsigned int max_max_requests = 16384;
+unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
+static unsigned int min_max_inline = 4096;
+static unsigned int max_max_inline = 65536;
+
+/* Statistics counters; each has a sysctl entry of the same name. */
+atomic_t rdma_stat_recv;
+atomic_t rdma_stat_read;
+atomic_t rdma_stat_write;
+atomic_t rdma_stat_sq_starve;
+atomic_t rdma_stat_rq_starve;
+atomic_t rdma_stat_rq_poll;
+atomic_t rdma_stat_rq_prod;
+atomic_t rdma_stat_sq_poll;
+atomic_t rdma_stat_sq_prod;
+
+/*
+ * Sysctl handler that exposes an atomic_t statistic (table->data) through
+ * a proc file.  Any write to the file resets the statistic to zero.  Any
+ * read returns its current value as a decimal number followed by a
+ * newline.
+ *
+ * The text is formatted afresh on every read and only the part starting
+ * at *ppos is copied out, so a reader that consumes the value in several
+ * short reads continues where the previous read stopped.
+ *
+ * On a read *lenp is set to the number of bytes copied and *ppos is
+ * advanced by the same amount; a write leaves both untouched.  Returns 0
+ * on success, -EINVAL if the table entry has no data pointer, and -EFAULT
+ * if the text does not fit the local buffer or the copy to user space
+ * fails.
+ */
+static int read_reset_stat(ctl_table *table, int write,
+                          struct file *filp, void __user *buffer, size_t *lenp,
+                          loff_t *ppos)
+{
+       atomic_t *stat = (atomic_t *)table->data;
+
+       if (!stat)
+               return -EINVAL;
+
+       if (write)
+               atomic_set(stat, 0);
+       else {
+               char str_buf[32];
+               char *data;
+               int len = snprintf(str_buf, sizeof(str_buf), "%d\n",
+                                  atomic_read(stat));
+               if (len >= (int)sizeof(str_buf))
+                       return -EFAULT;
+               /* Offset past the end of the text: nothing left to read. */
+               if (*ppos > len) {
+                       *lenp = 0;
+                       return 0;
+               }
+               data = &str_buf[*ppos];
+               len -= *ppos;
+               if (len > *lenp)
+                       len = *lenp;
+               /* Copy from the offset position, not the start of the text. */
+               if (len && copy_to_user(buffer, data, len))
+                       return -EFAULT;
+               *lenp = len;
+               *ppos += len;
+       }
+       return 0;
+}
+
+/* Handle returned by register_sysctl_table(); NULL while none is held. */
+static struct ctl_table_header *svcrdma_table_header;
+
+/*
+ * Entries of the "svc_rdma" sysctl directory: three integer tunables that
+ * proc_dointvec_minmax checks against their extra1/extra2 bounds, followed
+ * by the statistics counters served by read_reset_stat.
+ */
+static ctl_table svcrdma_parm_table[] = {
+       {
+               .procname       = "max_requests",
+               .data           = &svcrdma_max_requests,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &min_max_requests,
+               .extra2         = &max_max_requests
+       },
+       {
+               .procname       = "max_req_size",
+               .data           = &svcrdma_max_req_size,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &min_max_inline,
+               .extra2         = &max_max_inline
+       },
+       {
+               .procname       = "max_outbound_read_requests",
+               .data           = &svcrdma_ord,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &min_ord,
+               .extra2         = &max_ord,
+       },
+
+       /* Statistics: a read returns the value, any write resets it. */
+       {
+               .procname       = "rdma_stat_read",
+               .data           = &rdma_stat_read,
+               .maxlen         = sizeof(atomic_t),
+               .mode           = 0644,
+               .proc_handler   = &read_reset_stat,
+       },
+       {
+               .procname       = "rdma_stat_recv",
+               .data           = &rdma_stat_recv,
+               .maxlen         = sizeof(atomic_t),
+               .mode           = 0644,
+               .proc_handler   = &read_reset_stat,
+       },
+       {
+               .procname       = "rdma_stat_write",
+               .data           = &rdma_stat_write,
+               .maxlen         = sizeof(atomic_t),
+               .mode           = 0644,
+               .proc_handler   = &read_reset_stat,
+       },
+       {
+               .procname       = "rdma_stat_sq_starve",
+               .data           = &rdma_stat_sq_starve,
+               .maxlen         = sizeof(atomic_t),
+               .mode           = 0644,
+               .proc_handler   = &read_reset_stat,
+       },
+       {
+               .procname       = "rdma_stat_rq_starve",
+               .data           = &rdma_stat_rq_starve,
+               .maxlen         = sizeof(atomic_t),
+               .mode           = 0644,
+               .proc_handler   = &read_reset_stat,
+       },
+       {
+               .procname       = "rdma_stat_rq_poll",
+               .data           = &rdma_stat_rq_poll,
+               .maxlen         = sizeof(atomic_t),
+               .mode           = 0644,
+               .proc_handler   = &read_reset_stat,
+       },
+       {
+               .procname       = "rdma_stat_rq_prod",
+               .data           = &rdma_stat_rq_prod,
+               .maxlen         = sizeof(atomic_t),
+               .mode           = 0644,
+               .proc_handler   = &read_reset_stat,
+       },
+       {
+               .procname       = "rdma_stat_sq_poll",
+               .data           = &rdma_stat_sq_poll,
+               .maxlen         = sizeof(atomic_t),
+               .mode           = 0644,
+               .proc_handler   = &read_reset_stat,
+       },
+       {
+               .procname       = "rdma_stat_sq_prod",
+               .data           = &rdma_stat_sq_prod,
+               .maxlen         = sizeof(atomic_t),
+               .mode           = 0644,
+               .proc_handler   = &read_reset_stat,
+       },
+       {
+               .ctl_name = 0,
+       },
+};
+
+/* The "svc_rdma" directory; its children are svcrdma_parm_table. */
+static ctl_table svcrdma_table[] = {
+       {
+               .procname       = "svc_rdma",
+               .mode           = 0555,
+               .child          = svcrdma_table
+       },
+       {
+               .ctl_name = 0,
+       },
+};
+
+/*
+ * Root of the tree handed to register_sysctl_table(): the "sunrpc"
+ * directory (CTL_SUNRPC) with svcrdma_table as its child.
+ */
+static ctl_table svcrdma_root_table[] = {
+       {
+               .ctl_name       = CTL_SUNRPC,
+               .procname       = "sunrpc",
+               .mode           = 0555,
+               .child          = svcrdma_table
+       },
+       {
+               .ctl_name = 0,
+       },
+};
+
+/*
+ * Module exit handler: drops the sysctl table if one is registered, then
+ * removes the RDMA class from the SVC transport switch.
+ */
+void svc_rdma_cleanup(void)
+{
+       struct ctl_table_header *hdr = svcrdma_table_header;
+
+       dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
+       if (hdr != NULL) {
+               /* Forget the handle so a later init registers afresh. */
+               svcrdma_table_header = NULL;
+               unregister_sysctl_table(hdr);
+       }
+       svc_unreg_xprt_class(&svc_rdma_class);
+}
+
+/*
+ * Module init handler: prints the current tunables through dprintk,
+ * registers the sysctl tree unless a handle is already held, and adds the
+ * RDMA class to the SVC transport switch.  Always returns 0.
+ */
+int svc_rdma_init(void)
+{
+       dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
+       dprintk("\tsvcrdma_ord      : %d\n", svcrdma_ord);
+       dprintk("\tmax_requests     : %d\n", svcrdma_max_requests);
+       dprintk("\tsq_depth         : %d\n",
+               svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
+       dprintk("\tmax_inline       : %d\n", svcrdma_max_req_size);
+
+       /* An already registered table is kept as it is. */
+       if (svcrdma_table_header == NULL) {
+               svcrdma_table_header =
+                       register_sysctl_table(svcrdma_root_table);
+       }
+
+       /* Hook the RDMA transport class into the SVC transport switch. */
+       svc_reg_xprt_class(&svc_rdma_class);
+       return 0;
+}
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("SVC RDMA Transport");
+MODULE_LICENSE("Dual BSD/GPL");
+module_init(svc_rdma_init);
+module_exit(svc_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
new file mode 100644 (file)
index 0000000..9530ef2
--- /dev/null
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/debug.h>
+#include <asm/unaligned.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY        RPCDBG_SVCXPRT
+
+/*
+ * Decodes a read chunk list in place, converting every chunk from network
+ * to host byte order. The expected format is as follows:
+ *    discrim  : xdr_one
+ *    position : u32 offset into XDR stream
+ *    handle   : u32 RKEY
+ *    length   : u32 <len of segment>
+ *    offset   : u64 remote va
+ *    . . .
+ *  end-of-list: xdr_zero
+ *
+ * va points at the first discriminator; vaend is the first address past
+ * the bytes that may be examined.
+ *
+ * Returns a pointer to the word that follows the xdr_zero terminator, or
+ * NULL if the terminator or any chunk would extend past vaend.
+ */
+static u32 *decode_read_list(u32 *va, u32 *vaend)
+{
+       struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
+
+       for (;;) {
+               u64 ch_offset;
+
+               /*
+                * Bounds-check the discriminator before looking at it:
+                * after ch++ it may sit at or beyond vaend, and a stale
+                * zero found there must not be taken for the terminator.
+                */
+               if ((unsigned long)(&ch->rc_discrim + 1) >
+                   (unsigned long)vaend)
+                       goto overrun;
+               if (ch->rc_discrim == xdr_zero)
+                       break;
+
+               /* A non-zero discriminator promises a complete chunk */
+               if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
+                   (unsigned long)vaend)
+                       goto overrun;
+
+               ch->rc_discrim = ntohl(ch->rc_discrim);
+               ch->rc_position = ntohl(ch->rc_position);
+               ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle);
+               ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length);
+
+               /* the 64-bit offset is rewritten in place in host order */
+               va = (u32 *)&ch->rc_target.rs_offset;
+               xdr_decode_hyper(va, &ch_offset);
+               put_unaligned(ch_offset, (u64 *)va);
+               ch++;
+       }
+
+       /* rc_position is the word right after the terminating discrim */
+       return (u32 *)&ch->rc_position;
+
+overrun:
+       dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+       return NULL;
+}
+
+/*
+ * Walk a read chunk list and report, through ch_count and byte_count, how
+ * many chunks it holds and the sum of their rs_length fields. rs_length is
+ * added as-is, so the list is expected to be in host byte order already.
+ * The walk stops at the first entry whose discriminator is zero; the chunk
+ * list has already been verified to fit within the RPCRDMA header.
+ */
+void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
+                              int *ch_count, int *byte_count)
+{
+       *ch_count = 0;
+       *byte_count = 0;
+
+       while (ch->rc_discrim) {
+               *byte_count += ch->rc_target.rs_length;
+               ++*ch_count;
+               ch++;
+       }
+}
+
+/*
+ * Validates the write array at ary and converts it, in place, from network
+ * to host byte order. The caller has already found the discriminator to be
+ * non-zero. Shared by decode_write_list() and decode_reply_array().
+ *
+ * Returns 0 on success, or -EINVAL if the array header or its wc_nchunks
+ * chunks would extend past vaend.
+ */
+static int decode_wc_array(struct rpcrdma_write_array *ary, u32 *vaend)
+{
+       unsigned long start;
+       unsigned long end = (unsigned long)vaend;
+       int ch_no;
+
+       if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > end) {
+               dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+               return -EINVAL;
+       }
+       ary->wc_discrim = ntohl(ary->wc_discrim);
+       ary->wc_nchunks = ntohl(ary->wc_nchunks);
+
+       /*
+        * Compare the chunk count with the room that is left instead of
+        * forming start + size * count: that sum can wrap around for a
+        * large wc_nchunks where unsigned long is 32 bits wide, letting
+        * an oversized array slip past the check.
+        */
+       start = (unsigned long)&ary->wc_array[0];
+       if (start > end ||
+           ary->wc_nchunks >
+           (end - start) / sizeof(struct rpcrdma_write_chunk)) {
+               dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
+                       ary, ary->wc_nchunks, vaend);
+               return -EINVAL;
+       }
+
+       for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
+               u64 ch_offset;
+               u32 *offp;
+
+               ary->wc_array[ch_no].wc_target.rs_handle =
+                       ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
+               ary->wc_array[ch_no].wc_target.rs_length =
+                       ntohl(ary->wc_array[ch_no].wc_target.rs_length);
+
+               /* the 64-bit offset is rewritten in place in host order */
+               offp = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
+               xdr_decode_hyper(offp, &ch_offset);
+               put_unaligned(ch_offset, (u64 *)offp);
+       }
+       return 0;
+}
+
+/*
+ * Decodes a write chunk list. The expected format is as follows:
+ *    discrim  : xdr_one
+ *    nchunks  : <count>
+ *       handle   : u32 RKEY              ---+
+ *       length   : u32 <len of segment>     |
+ *       offset   : remote va                + <count>
+ *       . . .                               |
+ *                                        ---+
+ *
+ * Returns a pointer to the word at which decoding continues, or NULL if
+ * the list would extend past vaend.
+ */
+static u32 *decode_write_list(u32 *va, u32 *vaend)
+{
+       struct rpcrdma_write_array *ary =
+               (struct rpcrdma_write_array *)va;
+
+       /* The discriminator must lie within the message before it is read */
+       if ((unsigned long)(&ary->wc_discrim + 1) > (unsigned long)vaend) {
+               dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+               return NULL;
+       }
+
+       /* Check for no write-array */
+       if (ary->wc_discrim == xdr_zero)
+               return (u32 *)&ary->wc_nchunks;
+
+       if (decode_wc_array(ary, vaend) < 0)
+               return NULL;
+
+       /*
+        * rs_length is the 2nd 4B field in wc_target and taking its
+        * address skips the list terminator
+        */
+       return (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
+}
+
+/*
+ * Decodes a reply array, which is laid out like the write array handled
+ * by decode_write_list() but is not followed by a list terminator.
+ *
+ * Returns a pointer to the word right after the last chunk (or after the
+ * discriminator when no reply array is present), or NULL if the array
+ * would extend past vaend.
+ */
+static u32 *decode_reply_array(u32 *va, u32 *vaend)
+{
+       struct rpcrdma_write_array *ary =
+               (struct rpcrdma_write_array *)va;
+
+       /* The discriminator must lie within the message before it is read */
+       if ((unsigned long)(&ary->wc_discrim + 1) > (unsigned long)vaend) {
+               dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+               return NULL;
+       }
+
+       /* Check for no reply-array */
+       if (ary->wc_discrim == xdr_zero)
+               return (u32 *)&ary->wc_nchunks;
+
+       if (decode_wc_array(ary, vaend) < 0)
+               return NULL;
+
+       return (u32 *)&ary->wc_array[ary->wc_nchunks];
+}
+
+int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
+                           struct svc_rqst *rqstp)
+{
+       struct rpcrdma_msg *rmsgp = NULL;
+       u32 *va;
+       u32 *vaend;
+       u32 hdr_len;
+
+       rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+
+       /* Verify that there's enough bytes for header + something */
+       if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
+               dprintk("svcrdma: header too short = %d\n",
+                       rqstp->rq_arg.len);
+               return -EINVAL;
+       }
+
+       /* Decode the header */
+       rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
+       rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
+       rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
+       rmsgp->rm_type = ntohl(rmsgp->rm_type);
+
+       if (rmsgp->rm_vers != RPCRDMA_VERSION)
+               return -ENOSYS;
+
+       /* Pull in the extra for the padded case and bump our pointer */
+       if (rmsgp->rm_type == RDMA_MSGP) {
+               int hdrlen;
+               rmsgp->rm_body.rm_padded.rm_align =
+                       ntohl(rmsgp->rm_body.rm_padded.rm_align);
+               rmsgp->rm_body.rm_padded.rm_thresh =
+                       ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
+
+               va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
+               rqstp->rq_arg.head[0].iov_base = va;
+               hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
+               rqstp->rq_arg.head[0].iov_len -= hdrlen;
+               if (hdrlen > rqstp->rq_arg.len)
+                       return -EINVAL;
+               return hdrlen;
+       }
+
+       /* The chunk list may contain either a read chunk list or a write
+        * chunk list and a reply chunk list.
+        */
+       va = &rmsgp->rm_body.rm_chunks[0];
+       vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+       va = decode_read_list(va, vaend);
+       if (!va)
+               return -EINVAL;
+       va = decode_write_list(va, vaend);
+       if (!va)
+               return -EINVAL;
+       va = decode_reply_array(va, vaend);
+       if (!va)
+               return -EINVAL;
+
+       rqstp->rq_arg.head[0].iov_base = va;
+       hdr_len = (unsigned long)va - (unsigned long)rmsgp;