whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit c7a2c49ea6c9eebbe44ff2c08b663b2905ee2c13
parent 033078a9afe504ac9e615d10c4b35d634450b637
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Fri, 26 Oct 2018 13:05:26 -0700

Merge tag 'nfs-for-4.20-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs

Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable fixes:
   - Fix the NFSv4.1 r/wsize sanity checking
   - Reset the RPC/RDMA credit grant properly after a disconnect
   - Fix a missed page unlock after pg_doio()

  Features and optimisations:
   - Overhaul of the RPC client socket code to eliminate a locking
     bottleneck and reduce the latency when transmitting lots of
     requests in parallel.
   - Allow parallelisation of the RPCSEC_GSS encoding of an RPC request.
   - Convert the RPC client socket receive code to use iovec_iter() for
     improved efficiency.
   - Convert several NFS and RPC lookup operations to use RCU instead of
     taking global locks.
   - Avoid the need for BH-safe locks in the RPC/RDMA back channel.

  Bugfixes and cleanups:
   - Fix lock recovery during NFSv4 delegation recalls
   - Fix the NFSv4 + NFSv4.1 "lookup revalidate + open file" case.
   - Fixes for the RPC connection metrics
   - Various RPC client layer cleanups to consolidate stream based
     sockets
   - RPC/RDMA connection cleanups
   - Simplify the RPC/RDMA cleanup after memory operation failures
   - Clean ups for NFS v4.2 copy completion and NFSv4 open state
     reclaim"

* tag 'nfs-for-4.20-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (97 commits)
  SUNRPC: Convert the auth cred cache to use refcount_t
  SUNRPC: Convert auth creds to use refcount_t
  SUNRPC: Simplify lookup code
  SUNRPC: Clean up the AUTH cache code
  NFS: change sign of nfs_fh length
  sunrpc: safely reallow resvport min/max inversion
  nfs: remove redundant call to nfs_context_set_write_error()
  nfs: Fix a missed page unlock after pg_doio()
  SUNRPC: Fix a compile warning for cmpxchg64()
  NFSv4.x: fix lock recovery during delegation recall
  SUNRPC: use cmpxchg64() in gss_seq_send64_fetch_and_inc()
  xprtrdma: Squelch a sparse warning
  xprtrdma: Clean up xprt_rdma_disconnect_inject
  xprtrdma: Add documenting comments
  xprtrdma: Report when there were zero posted Receives
  xprtrdma: Move rb_flags initialization
  xprtrdma: Don't disable BH's in backchannel server
  xprtrdma: Remove memory address of "ep" from an error message
  xprtrdma: Rename rpcrdma_qp_async_error_upcall
  xprtrdma: Simplify RPC wake-ups on connect
  ...

Diffstat:
Mfs/nfs/delegation.c | 17+++++++++--------
Mfs/nfs/dir.c | 295+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mfs/nfs/filelayout/filelayout.c | 1+
Mfs/nfs/flexfilelayout/flexfilelayout.c | 1+
Mfs/nfs/flexfilelayout/flexfilelayoutdev.c | 2+-
Mfs/nfs/inode.c | 70++++++++++++++++++++++++++++------------------------------------------
Mfs/nfs/nfs3proc.c | 5+++++
Mfs/nfs/nfs3xdr.c | 10++++++++--
Mfs/nfs/nfs4_fs.h | 3++-
Mfs/nfs/nfs4client.c | 16+++++++++-------
Mfs/nfs/nfs4proc.c | 53++++++++++++++++++++++++++++++++++++++++-------------
Mfs/nfs/nfs4state.c | 254++++++++++++++++++++++++++++++++++++++++++-------------------------------------
Mfs/nfs/nfs4xdr.c | 2+-
Mfs/nfs/pagelist.c | 49+++++++++++++++++++++++--------------------------
Mfs/nfs/pnfs.c | 16+++++++++++++---
Mfs/nfs/pnfs.h | 1+
Mfs/nfs/read.c | 10++++------
Minclude/linux/nfs_fs.h | 2++
Minclude/linux/nfs_fs_sb.h | 3+++
Minclude/linux/nfs_xdr.h | 28+++++++++++++++++++---------
Minclude/linux/sunrpc/auth.h | 18+++++++++---------
Minclude/linux/sunrpc/auth_gss.h | 1+
Minclude/linux/sunrpc/bc_xprt.h | 1+
Minclude/linux/sunrpc/gss_krb5.h | 3++-
Minclude/linux/sunrpc/sched.h | 10++++++----
Minclude/linux/sunrpc/svc_xprt.h | 1-
Minclude/linux/sunrpc/xdr.h | 11++++++++---
Minclude/linux/sunrpc/xprt.h | 35+++++++++++++++++++++++++++--------
Minclude/linux/sunrpc/xprtsock.h | 36+++++++++++++++++-------------------
Minclude/trace/events/rpcrdma.h | 18+++++++++---------
Minclude/trace/events/sunrpc.h | 37++++++++++++-------------------------
Mnet/sunrpc/auth.c | 310+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mnet/sunrpc/auth_generic.c | 2+-
Mnet/sunrpc/auth_gss/auth_gss.c | 45+++++++++++++++++++++++++++++++++++++++++++--
Mnet/sunrpc/auth_gss/gss_krb5_seal.c | 38+++++++++++++++++++++++++++-----------
Mnet/sunrpc/auth_gss/gss_krb5_wrap.c | 8++------
Mnet/sunrpc/auth_gss/gss_mech_switch.c | 28++++++++++++++--------------
Mnet/sunrpc/auth_gss/gss_rpc_xdr.c | 1+
Mnet/sunrpc/auth_null.c | 6+++---
Mnet/sunrpc/auth_unix.c | 4++--
Mnet/sunrpc/backchannel_rqst.c | 1-
Mnet/sunrpc/clnt.c | 174+++++++++++++++++++++++++++++++++++++------------------------------------------
Mnet/sunrpc/sched.c | 178++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
Mnet/sunrpc/socklib.c | 10+++++-----
Mnet/sunrpc/svc_xprt.c | 2--
Mnet/sunrpc/svcsock.c | 6+++---
Mnet/sunrpc/xdr.c | 34++++++++++++++++++++++++++++++++++
Mnet/sunrpc/xprt.c | 908++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
Mnet/sunrpc/xprtrdma/backchannel.c | 20+++++++++++---------
Mnet/sunrpc/xprtrdma/fmr_ops.c | 131++++++++++++++++++++++++++++++++++++-------------------------------------------
Mnet/sunrpc/xprtrdma/frwr_ops.c | 137++++++++++++++++++++++++++++++-------------------------------------------------
Mnet/sunrpc/xprtrdma/rpc_rdma.c | 30+++++++++++++-----------------
Mnet/sunrpc/xprtrdma/svc_rdma_backchannel.c | 15+++++----------
Mnet/sunrpc/xprtrdma/transport.c | 120++++++++++++++++++++++++++++++++++++++-----------------------------------------
Mnet/sunrpc/xprtrdma/verbs.c | 178+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Mnet/sunrpc/xprtrdma/xprt_rdma.h | 18+++++++++---------
Mnet/sunrpc/xprtsock.c | 1107++++++++++++++++++++++++++++++++++---------------------------------------------
57 files changed, 2481 insertions(+), 2039 deletions(-)

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c @@ -93,7 +93,7 @@ int nfs4_check_delegation(struct inode *inode, fmode_t flags) return nfs4_do_check_delegation(inode, flags, false); } -static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid) { struct inode *inode = state->inode; struct file_lock *fl; @@ -108,7 +108,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ spin_lock(&flctx->flc_lock); restart: list_for_each_entry(fl, list, fl_list) { - if (nfs_file_open_context(fl->fl_file) != ctx) + if (nfs_file_open_context(fl->fl_file)->state != state) continue; spin_unlock(&flctx->flc_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); @@ -136,8 +136,8 @@ static int nfs_delegation_claim_opens(struct inode *inode, int err; again: - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; @@ -147,15 +147,16 @@ again: continue; if (!nfs4_stateid_match(&state->stateid, stateid)) continue; - get_nfs_open_context(ctx); - spin_unlock(&inode->i_lock); + if (!get_nfs_open_context(ctx)) + continue; + rcu_read_unlock(); sp = state->owner; /* Block nfs4_proc_unlck */ mutex_lock(&sp->so_delegreturn_mutex); seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid, type); if (!err) - err = nfs_delegation_claim_locks(ctx, state, stateid); + err = nfs_delegation_claim_locks(state, stateid); if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) err = -EAGAIN; mutex_unlock(&sp->so_delegreturn_mutex); @@ -164,7 +165,7 @@ again: return err; goto again; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); return 0; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c @@ -1072,6 +1072,100 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); } +static int +nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, + struct inode *inode, int error) +{ + switch (error) { + case 1: + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", + __func__, dentry); + return 1; + case 0: + nfs_mark_for_revalidate(dir); + if (inode && S_ISDIR(inode->i_mode)) { + /* Purge readdir caches. */ + nfs_zap_caches(inode); + /* + * We can't d_drop the root of a disconnected tree: + * its d_hash is on the s_anon list and d_drop() would hide + * it from shrink_dcache_for_unmount(), leading to busy + * inodes on unmount and further oopses. + */ + if (IS_ROOT(dentry)) + return 1; + } + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", + __func__, dentry); + return 0; + } + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", + __func__, dentry, error); + return error; +} + +static int +nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + int ret = 1; + if (nfs_neg_need_reval(dir, dentry, flags)) { + if (flags & LOOKUP_RCU) + return -ECHILD; + ret = 0; + } + return nfs_lookup_revalidate_done(dir, dentry, NULL, ret); +} + +static int +nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + return nfs_lookup_revalidate_done(dir, dentry, inode, 1); +} + +static int +nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct nfs_fh *fhandle; + struct nfs_fattr *fattr; + struct nfs4_label *label; + int ret; + + ret = -ENOMEM; + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (fhandle == NULL || fattr == NULL || IS_ERR(label)) + goto out; + + ret = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); + if (ret < 0) { + if (ret == -ESTALE || ret == -ENOENT) + ret = 0; + goto out; + } + ret = 0; + if (nfs_compare_fh(NFS_FH(inode), fhandle)) + goto out; + if (nfs_refresh_inode(inode, fattr) < 0) + goto out; + + nfs_setsecurity(inode, fattr, label); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + + /* set a readdirplus hint that we had a cache miss */ + nfs_force_use_readdirplus(dir); + ret = 1; +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + nfs4_label_free(label); + return nfs_lookup_revalidate_done(dir, dentry, inode, ret); +} + /* * This is called every time the dcache has a lookup hit, * and we should check whether we can really trust that @@ -1083,58 +1177,36 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. */ -static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int +nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, + unsigned int flags) { - struct inode *dir; struct inode *inode; - struct dentry *parent; - struct nfs_fh *fhandle = NULL; - struct nfs_fattr *fattr = NULL; - struct nfs4_label *label = NULL; int error; - if (flags & LOOKUP_RCU) { - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - } nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = d_inode(dentry); - if (!inode) { - if (nfs_neg_need_reval(dir, dentry, flags)) { - if (flags & LOOKUP_RCU) - return -ECHILD; - goto out_bad; - } - goto out_valid; - } + if (!inode) + return nfs_lookup_revalidate_negative(dir, dentry, flags); if (is_bad_inode(inode)) { - if (flags & LOOKUP_RCU) - return -ECHILD; dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", __func__, dentry); goto out_bad; } if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) - goto out_set_verifier; + return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* Force a full look up iff the parent directory has changed */ if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) && nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { error = nfs_lookup_verify_inode(inode, flags); if (error) { - if (flags & LOOKUP_RCU) - return -ECHILD; if (error == -ESTALE) - goto out_zap_parent; - goto out_error; + nfs_zap_caches(dir); + goto out_bad; } nfs_advise_use_readdirplus(dir); goto out_valid; @@ -1146,81 +1218,45 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (NFS_STALE(inode)) goto out_bad; - error = -ENOMEM; - fhandle = nfs_alloc_fhandle(); - fattr = nfs_alloc_fattr(); - if (fhandle == NULL || fattr == NULL) - goto out_error; - - label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); - if (IS_ERR(label)) - goto out_error; - trace_nfs_lookup_revalidate_enter(dir, dentry, flags); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); + error = nfs_lookup_revalidate_dentry(dir, dentry, inode); trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); - if (error == -ESTALE || error == -ENOENT) - goto out_bad; - if (error) - goto out_error; - if (nfs_compare_fh(NFS_FH(inode), fhandle)) - goto out_bad; - if ((error = nfs_refresh_inode(inode, fattr)) != 0) - goto out_bad; - - nfs_setsecurity(inode, fattr, label); - - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); + return error; +out_valid: + return nfs_lookup_revalidate_done(dir, dentry, inode, 1); +out_bad: + if (flags & LOOKUP_RCU) + return -ECHILD; + return nfs_lookup_revalidate_done(dir, dentry, inode, 0); +} - /* set a readdirplus hint that we had a cache miss */ - nfs_force_use_readdirplus(dir); +static int +__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, + int (*reval)(struct inode *, struct dentry *, unsigned int)) +{ + struct dentry *parent; + struct inode *dir; + int ret; -out_set_verifier: - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - out_valid: if (flags & LOOKUP_RCU) { + parent = READ_ONCE(dentry->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + ret = reval(dir, dentry, flags); if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; - } else + } else { + parent = dget_parent(dentry); + ret = reval(d_inode(parent), dentry, flags); dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", - __func__, dentry); - return 1; -out_zap_parent: - nfs_zap_caches(dir); - out_bad: - WARN_ON(flags & LOOKUP_RCU); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); - nfs_mark_for_revalidate(dir); - if (inode && S_ISDIR(inode->i_mode)) { - /* Purge readdir caches. */ - nfs_zap_caches(inode); - /* - * We can't d_drop the root of a disconnected tree: - * its d_hash is on the s_anon list and d_drop() would hide - * it from shrink_dcache_for_unmount(), leading to busy - * inodes on unmount and further oopses. - */ - if (IS_ROOT(dentry)) - goto out_valid; } - dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", - __func__, dentry); - return 0; -out_error: - WARN_ON(flags & LOOKUP_RCU); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); - dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", - __func__, dentry, error); - return error; + return ret; +} + +static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +{ + return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); } /* @@ -1579,62 +1615,55 @@ no_open: } EXPORT_SYMBOL_GPL(nfs_atomic_open); -static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int +nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, + unsigned int flags) { struct inode *inode; - int ret = 0; if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) - goto no_open; + goto full_reval; if (d_mountpoint(dentry)) - goto no_open; - if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) - goto no_open; + goto full_reval; inode = d_inode(dentry); /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. */ - if (inode == NULL) { - struct dentry *parent; - struct inode *dir; - - if (flags & LOOKUP_RCU) { - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - } - if (!nfs_neg_need_reval(dir, dentry, flags)) - ret = 1; - else if (flags & LOOKUP_RCU) - ret = -ECHILD; - if (!(flags & LOOKUP_RCU)) - dput(parent); - else if (parent != READ_ONCE(dentry->d_parent)) - return -ECHILD; - goto out; - } + if (inode == NULL) + goto full_reval; + + if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) + return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) - goto no_open; + goto full_reval; + /* We cannot do exclusive creation on a positive dentry */ - if (flags & LOOKUP_EXCL) - goto no_open; + if (flags & (LOOKUP_EXCL | LOOKUP_REVAL)) + goto reval_dentry; + + /* Check if the directory changed */ + if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) + goto reval_dentry; /* Let f_op->open() actually open (and revalidate) the file */ - ret = 1; + return 1; +reval_dentry: + if (flags & LOOKUP_RCU) + return -ECHILD; + return nfs_lookup_revalidate_dentry(dir, dentry, inode);; -out: - return ret; +full_reval: + return nfs_do_lookup_revalidate(dir, dentry, flags); +} -no_open: - return nfs_lookup_revalidate(dentry, flags); +static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) +{ + return __nfs_lookup_revalidate(dentry, flags, + nfs4_do_lookup_revalidate); } #endif /* CONFIG_NFSV4 */ diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c @@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, + .max_layoutget_response = 4096, /* 1 page or so... */ .alloc_layout_hdr = filelayout_alloc_layout_hdr, .free_layout_hdr = filelayout_free_layout_hdr, .alloc_lseg = filelayout_alloc_lseg, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -2356,6 +2356,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .name = "LAYOUT_FLEX_FILES", .owner = THIS_MODULE, .flags = PNFS_LAYOUTGET_ON_OPEN, + .max_layoutget_response = 4096, /* 1 page or so... */ .set_layoutdriver = ff_layout_set_layoutdriver, .alloc_layout_hdr = ff_layout_alloc_layout_hdr, .free_layout_hdr = ff_layout_free_layout_hdr, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -453,7 +453,7 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); struct rpc_cred *cred; - if (mirror) { + if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) { cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); if (!cred) cred = get_rpccred(mdscred); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c @@ -857,15 +857,14 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) { - struct nfs_lock_context *head = &ctx->lock_context; - struct nfs_lock_context *pos = head; + struct nfs_lock_context *pos; - do { + list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) { if (pos->lockowner != current->files) continue; - refcount_inc(&pos->count); - return pos; - } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); + if (refcount_inc_not_zero(&pos->count)) + return pos; + } return NULL; } @@ -874,10 +873,10 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) struct nfs_lock_context *res, *new = NULL; struct inode *inode = d_inode(ctx->dentry); - spin_lock(&inode->i_lock); + rcu_read_lock(); res = __nfs_find_lock_context(ctx); + rcu_read_unlock(); if (res == NULL) { - spin_unlock(&inode->i_lock); new = kmalloc(sizeof(*new), GFP_KERNEL); if (new == NULL) return ERR_PTR(-ENOMEM); @@ -885,14 +884,14 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); if (res == NULL) { - list_add_tail(&new->list, &ctx->lock_context.list); + list_add_tail_rcu(&new->list, &ctx->lock_context.list); new->open_context = ctx; res = new; new = NULL; } + spin_unlock(&inode->i_lock); + kfree(new); } - spin_unlock(&inode->i_lock); - kfree(new); return res; } EXPORT_SYMBOL_GPL(nfs_get_lock_context); @@ -904,9 +903,9 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx) if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; - list_del(&l_ctx->list); + list_del_rcu(&l_ctx->list); spin_unlock(&inode->i_lock); - kfree(l_ctx); + kfree_rcu(l_ctx, rcu_head); } EXPORT_SYMBOL_GPL(nfs_put_lock_context); @@ -978,9 +977,9 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { - if (ctx != NULL) - refcount_inc(&ctx->lock_context.count); - return ctx; + if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count)) + return ctx; + return NULL; } EXPORT_SYMBOL_GPL(get_nfs_open_context); @@ -989,13 +988,13 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) struct inode *inode = d_inode(ctx->dentry); struct super_block *sb = ctx->dentry->d_sb; + if (!refcount_dec_and_test(&ctx->lock_context.count)) + return; if (!list_empty(&ctx->list)) { - if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) - return; - list_del(&ctx->list); + spin_lock(&inode->i_lock); + list_del_rcu(&ctx->list); spin_unlock(&inode->i_lock); - } else if (!refcount_dec_and_test(&ctx->lock_context.count)) - return; + } if (inode != NULL) NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) @@ -1003,7 +1002,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) dput(ctx->dentry); nfs_sb_deactive(sb); kfree(ctx->mdsthreshold); - kfree(ctx); + kfree_rcu(ctx, rcu_head); } void put_nfs_open_context(struct nfs_open_context *ctx) @@ -1027,10 +1026,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - if (ctx->mode & FMODE_WRITE) - list_add(&ctx->list, &nfsi->open_files); - else - list_add_tail(&ctx->list, &nfsi->open_files); + list_add_tail_rcu(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); @@ -1051,16 +1047,17 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *pos, *ctx = NULL; - spin_lock(&inode->i_lock); - list_for_each_entry(pos, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, &nfsi->open_files, list) { if (cred != NULL && pos->cred != cred) continue; if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) continue; ctx = get_nfs_open_context(pos); - break; + if (ctx) + break; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ctx; } @@ -1078,9 +1075,6 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx->error < 0) invalidate_inode_pages2(inode->i_mapping); filp->private_data = NULL; - spin_lock(&inode->i_lock); - list_move_tail(&ctx->list, &NFS_I(inode)->open_files); - spin_unlock(&inode->i_lock); put_nfs_open_context_sync(ctx); } } @@ -1329,19 +1323,11 @@ static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; - assert_spin_locked(&inode->i_lock); - if (!S_ISREG(inode->i_mode)) return false; if (list_empty(&nfsi->open_files)) return false; - /* Note: This relies on nfsi->open_files being ordered with writers - * being placed at the head of the list. - * See nfs_inode_attach_open_context() - */ - return (list_first_entry(&nfsi->open_files, - struct nfs_open_context, - list)->mode & FMODE_WRITE) == FMODE_WRITE; + return inode_is_open_for_write(inode); } static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c @@ -786,6 +786,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct inode *inode = hdr->inode; + struct nfs_server *server = NFS_SERVER(inode); if (hdr->pgio_done_cb != NULL) return hdr->pgio_done_cb(task, hdr); @@ -793,6 +794,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; + if (task->tk_status >= 0 && !server->read_hdrsize) + cmpxchg(&server->read_hdrsize, 0, hdr->res.replen); + nfs_invalidate_atime(inode); nfs_refresh_inode(inode, &hdr->fattr); return 0; @@ -802,6 +806,7 @@ static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; + hdr->args.replen = NFS_SERVER(hdr->inode)->read_hdrsize; } static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c @@ -983,10 +983,11 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, const void *data) { const struct nfs_pgio_args *args = data; + unsigned int replen = args->replen ? args->replen : NFS3_readres_sz; encode_read3args(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, - args->count, NFS3_readres_sz); + args->count, replen); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -1364,10 +1365,12 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, encode_nfs_fh3(xdr, args->fh); encode_uint32(xdr, args->mask); - if (args->mask & (NFS_ACL | NFS_DFACL)) + if (args->mask & (NFS_ACL | NFS_DFACL)) { prepare_reply_buffer(req, args->pages, 0, NFSACL_MAXPAGES << PAGE_SHIFT, ACL3_getaclres_sz); + req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; + } } static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, @@ -1673,9 +1676,11 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { struct nfs_pgio_res *result = data; + unsigned int pos; enum nfs_stat status; int error; + pos = xdr_stream_pos(xdr); error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; @@ -1685,6 +1690,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, result->op_status = status; if (status != NFS3_OK) goto out_status; + result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2); error = decode_read3resok(xdr, result); out: return error; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h @@ -188,9 +188,10 @@ struct nfs4_state { unsigned int n_wronly; /* Number of write-only references */ unsigned int n_rdwr; /* Number of read/write references */ fmode_t state; /* State on the server (R,W, or RW) */ - atomic_t count; + refcount_t count; wait_queue_head_t waitq; + struct rcu_head rcu_head; }; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c @@ -950,10 +950,10 @@ EXPORT_SYMBOL_GPL(nfs4_set_ds_client); /* * Session has been established, and the client marked ready. - * Set the mount rsize and wsize with negotiated fore channel - * attributes which will be bound checked in nfs_server_set_fsinfo. + * Limit the mount rsize, wsize and dtsize using negotiated fore + * channel attributes. */ -static void nfs4_session_set_rwsize(struct nfs_server *server) +static void nfs4_session_limit_rwsize(struct nfs_server *server) { #ifdef CONFIG_NFS_V4_1 struct nfs4_session *sess; @@ -966,9 +966,11 @@ static void nfs4_session_set_rwsize(struct nfs_server *server) server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; - if (!server->rsize || server->rsize > server_resp_sz) + if (server->dtsize > server_resp_sz) + server->dtsize = server_resp_sz; + if (server->rsize > server_resp_sz) server->rsize = server_resp_sz; - if (!server->wsize || server->wsize > server_rqst_sz) + if (server->wsize > server_rqst_sz) server->wsize = server_rqst_sz; #endif /* CONFIG_NFS_V4_1 */ } @@ -1015,12 +1017,12 @@ static int nfs4_server_common_setup(struct nfs_server *server, (unsigned long long) server->fsid.minor); nfs_display_fhandle(mntfh, "Pseudo-fs root FH"); - nfs4_session_set_rwsize(server); - error = nfs_probe_fsinfo(server, mntfh, fattr); if (error < 0) goto out; + nfs4_session_limit_rwsize(server); + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c @@ -1349,12 +1349,20 @@ static bool nfs4_mode_match_open_stateid(struct nfs4_state *state, return false; } -static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) +static int can_open_cached(struct nfs4_state *state, fmode_t mode, + int open_mode, enum open_claim_type4 claim) { int ret = 0; if (open_mode & (O_EXCL|O_TRUNC)) goto out; + switch (claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + goto out; + default: + break; + } switch (mode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 @@ -1747,7 +1755,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) for (;;) { spin_lock(&state->owner->so_lock); - if (can_open_cached(state, fmode, open_mode)) { + if (can_open_cached(state, fmode, open_mode, claim)) { update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); goto out_return_state; @@ -1777,7 +1785,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) out: return ERR_PTR(ret); out_return_state: - atomic_inc(&state->count); + refcount_inc(&state->count); return state; } @@ -1849,7 +1857,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) update: update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode); - atomic_inc(&state->count); + refcount_inc(&state->count); return state; } @@ -1887,7 +1895,7 @@ nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data) return ERR_CAST(inode); if (data->state != NULL && data->state->inode == inode) { state = data->state; - atomic_inc(&state->count); + refcount_inc(&state->count); } else state = nfs4_get_open_state(inode, data->owner); iput(inode); @@ -1933,23 +1941,41 @@ nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) return ret; } -static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) +static struct nfs_open_context * +nfs4_state_find_open_context_mode(struct nfs4_state *state, fmode_t mode) { struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_open_context *ctx; - spin_lock(&state->inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { if (ctx->state != state) continue; - get_nfs_open_context(ctx); - spin_unlock(&state->inode->i_lock); + if ((ctx->mode & mode) != mode) + continue; + if (!get_nfs_open_context(ctx)) + continue; + rcu_read_unlock(); return ctx; } - spin_unlock(&state->inode->i_lock); + rcu_read_unlock(); return ERR_PTR(-ENOENT); } +static struct nfs_open_context * +nfs4_state_find_open_context(struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + + ctx = nfs4_state_find_open_context_mode(state, FMODE_READ|FMODE_WRITE); + if (!IS_ERR(ctx)) + return ctx; + ctx = nfs4_state_find_open_context_mode(state, FMODE_WRITE); + if (!IS_ERR(ctx)) + return ctx; + return nfs4_state_find_open_context_mode(state, FMODE_READ); +} + static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state, enum open_claim_type4 claim) { @@ -1960,7 +1986,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; - atomic_inc(&state->count); + refcount_inc(&state->count); return opendata; } @@ -2276,7 +2302,8 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) if (data->state != NULL) { struct nfs_delegation *delegation; - if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags)) + if (can_open_cached(data->state, data->o_arg.fmode, + data->o_arg.open_flags, claim)) goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c @@ -655,7 +655,7 @@ nfs4_alloc_open_state(void) state = kzalloc(sizeof(*state), GFP_NOFS); if (!state) return NULL; - atomic_set(&state->count, 1); + refcount_set(&state->count, 1); INIT_LIST_HEAD(&state->lock_states); spin_lock_init(&state->state_lock); seqlock_init(&state->seqlock); @@ -684,12 +684,12 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_state *state; - list_for_each_entry(state, &nfsi->open_states, inode_states) { + list_for_each_entry_rcu(state, &nfsi->open_states, inode_states) { if (state->owner != owner) continue; if (!nfs4_valid_open_stateid(state)) continue; - if (atomic_inc_not_zero(&state->count)) + if (refcount_inc_not_zero(&state->count)) return state; } return NULL; @@ -698,7 +698,7 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) static void nfs4_free_open_state(struct nfs4_state *state) { - kfree(state); + kfree_rcu(state, rcu_head); } struct nfs4_state * @@ -707,9 +707,9 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) struct nfs4_state *state, *new; struct nfs_inode *nfsi = NFS_I(inode); - spin_lock(&inode->i_lock); + rcu_read_lock(); state = __nfs4_find_state_byowner(inode, owner); - spin_unlock(&inode->i_lock); + rcu_read_unlock(); if (state) goto out; new = nfs4_alloc_open_state(); @@ -720,7 +720,7 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) state = new; state->owner = owner; atomic_inc(&owner->so_count); - list_add(&state->inode_states, &nfsi->open_states); + list_add_rcu(&state->inode_states, &nfsi->open_states); ihold(inode); state->inode = inode; spin_unlock(&inode->i_lock); @@ -743,10 +743,10 @@ void nfs4_put_open_state(struct nfs4_state *state) struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; - if (!atomic_dec_and_lock(&state->count, &owner->so_lock)) + if (!refcount_dec_and_lock(&state->count, &owner->so_lock)) return; spin_lock(&inode->i_lock); - list_del(&state->inode_states); + list_del_rcu(&state->inode_states); list_del(&state->open_states); spin_unlock(&inode->i_lock); spin_unlock(&owner->so_lock); @@ -1437,8 +1437,8 @@ void nfs_inode_find_state_and_recover(struct inode *inode, struct nfs4_state *state; bool found = false; - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; @@ -1456,7 +1456,7 @@ void nfs_inode_find_state_and_recover(struct inode *inode, nfs4_state_mark_reclaim_nograce(clp, state)) found = true; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); nfs_inode_find_delegation_state_and_recover(inode, stateid); if (found) @@ -1469,13 +1469,13 @@ static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { if (ctx->state != state) continue; set_bit(NFS_CONTEXT_BAD, &ctx->flags); } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); } static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) @@ -1549,10 +1549,62 @@ out: return status; } +#ifdef CONFIG_NFS_V4_2 +static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs4_copy_state *copy; + + if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags)) + return; + + spin_lock(&sp->so_server->nfs_client->cl_lock); + list_for_each_entry(copy, &sp->so_server->ss_copies, copies) { + if (nfs4_stateid_match_other(&state->stateid, &copy->parent_state->stateid)) + continue; + copy->flags = 1; + complete(&copy->completion); + break; + } + spin_unlock(&sp->so_server->nfs_client->cl_lock); +} +#else /* !CONFIG_NFS_V4_2 */ +static inline void nfs42_complete_copies(struct nfs4_state_owner *sp, + struct nfs4_state *state) +{ +} +#endif /* CONFIG_NFS_V4_2 */ + +static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state, + const struct nfs4_state_recovery_ops *ops) +{ + struct nfs4_lock_state *lock; + int status; + + status = ops->recover_open(sp, state); + if (status < 0) + return status; + + status = nfs4_reclaim_locks(state, ops); + if (status < 0) + return status; + + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { + spin_lock(&state->state_lock); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) + pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__); + } + spin_unlock(&state->state_lock); + } + + nfs42_complete_copies(sp, state); + clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + return status; +} + static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) { struct nfs4_state *state; - struct nfs4_lock_state *lock; int status = 0; /* Note: we rely on the sp->so_states list being ordered @@ -1573,79 +1625,45 @@ restart: continue; if (state->state == 0) continue; - atomic_inc(&state->count); + refcount_inc(&state->count); spin_unlock(&sp->so_lock); - status = ops->recover_open(sp, state); - if (status >= 0) { - status = nfs4_reclaim_locks(state, ops); - if (status >= 0) { - if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { - spin_lock(&state->state_lock); - list_for_each_entry(lock, &state->lock_states, ls_locks) { - if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) - pr_warn_ratelimited("NFS: " - "%s: Lock reclaim " - "failed!\n", __func__); - } - spin_unlock(&state->state_lock); - } - clear_bit(NFS_STATE_RECLAIM_NOGRACE, - &state->flags); -#ifdef CONFIG_NFS_V4_2 - if (test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags)) { - struct nfs4_copy_state *copy; - - spin_lock(&sp->so_server->nfs_client->cl_lock); - list_for_each_entry(copy, &sp->so_server->ss_copies, copies) { - if (memcmp(&state->stateid.other, &copy->parent_state->stateid.other, NFS4_STATEID_SIZE)) - continue; - copy->flags = 1; - complete(&copy->completion); - printk("AGLO: server rebooted waking up the copy\n"); - break; - } - spin_unlock(&sp->so_server->nfs_client->cl_lock); - } -#endif /* CONFIG_NFS_V4_2 */ - nfs4_put_open_state(state); - spin_lock(&sp->so_lock); - goto restart; - } - } + status = __nfs4_reclaim_open_state(sp, state, ops); + switch (status) { - default: - printk(KERN_ERR "NFS: %s: unhandled error %d\n", - __func__, status); - /* Fall through */ - case -ENOENT: - case -ENOMEM: - case -EACCES: - case -EROFS: - case -EIO: - case -ESTALE: - /* Open state on this file cannot be recovered */ - nfs4_state_mark_recovery_failed(state, status); - break; - case -EAGAIN: - ssleep(1); - /* Fall through */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_OLD_STATEID: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_RECLAIM_BAD: - case -NFS4ERR_RECLAIM_CONFLICT: - nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + default: + if (status >= 0) break; - case -NFS4ERR_EXPIRED: - case -NFS4ERR_NO_GRACE: - nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - goto out_err; + printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status); + /* Fall through */ + case -ENOENT: + case -ENOMEM: + case -EACCES: + case -EROFS: + case -EIO: + case -ESTALE: + /* Open state on this file cannot be recovered */ + nfs4_state_mark_recovery_failed(state, status); + break; + case -EAGAIN: + ssleep(1); + /* Fall through */ + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto out_err; } nfs4_put_open_state(state); spin_lock(&sp->so_lock); @@ -1795,38 +1813,38 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { - case 0: - break; - case -NFS4ERR_CB_PATH_DOWN: - nfs40_handle_cb_pathdown(clp); - break; - case -NFS4ERR_NO_GRACE: - nfs4_state_end_reclaim_reboot(clp); - break; - case -NFS4ERR_STALE_CLIENTID: - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_start_reclaim_reboot(clp); - break; - case -NFS4ERR_EXPIRED: - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_start_reclaim_nograce(clp); - break; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_SEQ_FALSE_RETRY: - case -NFS4ERR_SEQ_MISORDERED: - set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); - /* Zero session reset errors */ - break; - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); - break; - default: - dprintk("%s: failed to handle error %d for server %s\n", - __func__, error, clp->cl_hostname); - return error; + case 0: + break; + case -NFS4ERR_CB_PATH_DOWN: + nfs40_handle_cb_pathdown(clp); + break; + case -NFS4ERR_NO_GRACE: + nfs4_state_end_reclaim_reboot(clp); + break; + case -NFS4ERR_STALE_CLIENTID: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_reboot(clp); + break; + case -NFS4ERR_EXPIRED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* Zero session reset errors */ + break; + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + break; + default: + dprintk("%s: failed to handle error %d for server %s\n", + __func__, error, clp->cl_hostname); + return error; } dprintk("%s: handled error %d for server %s\n", __func__, error, clp->cl_hostname); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c @@ -3516,7 +3516,7 @@ static int decode_attr_exclcreat_supported(struct xdr_stream *xdr, static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) { __be32 *p; - int len; + u32 len; if (fh != NULL) memset(fh, 0, sizeof(*fh)); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c @@ -63,14 +63,14 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) { - spin_lock(&hdr->lock); - if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags) - || pos < hdr->io_start + hdr->good_bytes) { + unsigned int new = pos - hdr->io_start; + + if (hdr->good_bytes > new) { + hdr->good_bytes = new; clear_bit(NFS_IOHDR_EOF, &hdr->flags); - hdr->good_bytes = pos - hdr->io_start; - hdr->error = error; + if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)) + hdr->error = error; } - spin_unlock(&hdr->lock); } static inline struct nfs_page * @@ -494,7 +494,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops) if (hdr) { INIT_LIST_HEAD(&hdr->pages); - spin_lock_init(&hdr->lock); hdr->rw_ops = ops; } return hdr; @@ -1111,6 +1110,20 @@ static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc, return ret; } +static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc) +{ + u32 midx; + struct nfs_pgio_mirror *mirror; + + if (!desc->pg_error) + return; + + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + } +} + int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { @@ -1161,25 +1174,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, return 1; out_failed: - /* - * We might have failed before sending any reqs over wire. - * Clean up rest of the reqs in mirror pg_list. - */ - if (desc->pg_error) { - struct nfs_pgio_mirror *mirror; - void (*func)(struct list_head *); - - /* remember fatal errors */ - if (nfs_error_is_fatal(desc->pg_error)) - nfs_context_set_write_error(req->wb_context, - desc->pg_error); - - func = desc->pg_completion_ops->error_cleanup; - for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; - func(&mirror->pg_list); - } - } + nfs_pageio_error_cleanup(desc); return 0; } @@ -1251,6 +1246,8 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) for (midx = 0; midx < desc->pg_mirror_count; midx++) nfs_pageio_complete_mirror(desc, midx); + if (desc->pg_error < 0) + nfs_pageio_error_cleanup(desc); if (desc->pg_ops->pg_cleanup) desc->pg_ops->pg_cleanup(desc); nfs_pageio_cleanup_mirroring(desc); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c @@ -965,7 +965,7 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) struct page **pages; int i; - pages = kcalloc(size, sizeof(struct page *), gfp_flags); + pages = kmalloc_array(size, sizeof(struct page *), gfp_flags); if (!pages) { dprintk("%s: can't alloc array of %zu pages\n", __func__, size); return NULL; @@ -975,7 +975,7 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) pages[i] = alloc_page(gfp_flags); if (!pages[i]) { dprintk("%s: failed to allocate page\n", __func__); - nfs4_free_pages(pages, size); + nfs4_free_pages(pages, i); return NULL; } } @@ -991,6 +991,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino, gfp_t gfp_flags) { struct nfs_server *server = pnfs_find_server(ino, ctx); + size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response; size_t max_pages = max_response_pages(server); struct nfs4_layoutget *lgp; @@ -1000,6 +1001,12 @@ pnfs_alloc_init_layoutget_args(struct inode *ino, if (lgp == NULL) return NULL; + if (max_reply_sz) { + size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages < max_pages) + max_pages = npages; + } + lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); if (!lgp->args.layout.pages) { kfree(lgp); @@ -1332,6 +1339,7 @@ bool pnfs_roc(struct inode *ino, if (!nfs_have_layout(ino)) return false; retry: + rcu_read_lock(); spin_lock(&ino->i_lock); lo = nfsi->layout; if (!lo || !pnfs_layout_is_valid(lo) || @@ -1342,6 +1350,7 @@ retry: pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { spin_unlock(&ino->i_lock); + rcu_read_unlock(); wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); pnfs_put_layout_hdr(lo); @@ -1355,7 +1364,7 @@ retry: skip_read = true; } - list_for_each_entry(ctx, &nfsi->open_files, list) { + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; @@ -1403,6 +1412,7 @@ retry: out_noroc: spin_unlock(&ino->i_lock); + rcu_read_unlock(); pnfs_layoutcommit_inode(ino, true); if (roc) { struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h @@ -125,6 +125,7 @@ struct pnfs_layoutdriver_type { struct module *owner; unsigned flags; unsigned max_deviceinfo_size; + unsigned max_layoutget_response; int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); int (*clear_layoutdriver) (struct nfs_server *); diff --git a/fs/nfs/read.c b/fs/nfs/read.c @@ -276,16 +276,14 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_header *hdr) { if (hdr->res.eof) { - loff_t bound; + loff_t pos = hdr->args.offset + hdr->res.count; + unsigned int new = pos - hdr->io_start; - bound = hdr->args.offset + hdr->res.count; - spin_lock(&hdr->lock); - if (bound < hdr->io_start + hdr->good_bytes) { + if (hdr->good_bytes > new) { + hdr->good_bytes = new; set_bit(NFS_IOHDR_EOF, &hdr->flags); clear_bit(NFS_IOHDR_ERROR, &hdr->flags); - hdr->good_bytes = bound - hdr->io_start; } - spin_unlock(&hdr->lock); } else if (hdr->res.count < hdr->args.count) nfs_readpage_retry(task, hdr); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h @@ -62,6 +62,7 @@ struct nfs_lock_context { struct nfs_open_context *open_context; fl_owner_t lockowner; atomic_t io_count; + struct rcu_head rcu_head; }; struct nfs4_state; @@ -82,6 +83,7 @@ struct nfs_open_context { struct list_head list; struct nfs4_threshold *mdsthreshold; + struct rcu_head rcu_head; }; struct nfs_open_dir_context { diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h @@ -228,6 +228,9 @@ struct nfs_server { unsigned short mountd_port; unsigned short mountd_protocol; struct rpc_wait_queue uoc_rpcwaitq; + + /* XDR related information */ + unsigned int read_hdrsize; }; /* Server capabilities */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h @@ -608,8 +608,13 @@ struct nfs_pgio_args { __u32 count; unsigned int pgbase; struct page ** pages; - const u32 * bitmask; /* used by write */ - enum nfs3_stable_how stable; /* used by write */ + union { + unsigned int replen; /* used by read */ + struct { + const u32 * bitmask; /* used by write */ + enum nfs3_stable_how stable; /* used by write */ + }; + }; }; struct nfs_pgio_res { @@ -617,10 +622,16 @@ struct nfs_pgio_res { struct nfs_fattr * fattr; __u32 count; __u32 op_status; - int eof; /* used by read */ - struct nfs_writeverf * verf; /* used by write */ - const struct nfs_server *server; /* used by write */ - + union { + struct { + unsigned int replen; /* used by read */ + int eof; /* used by read */ + }; + struct { + struct nfs_writeverf * verf; /* used by write */ + const struct nfs_server *server; /* used by write */ + }; + }; }; /* @@ -1471,11 +1482,10 @@ struct nfs_pgio_header { const struct nfs_rw_ops *rw_ops; struct nfs_io_completion *io_completion; struct nfs_direct_req *dreq; - spinlock_t lock; - /* fields protected by lock */ + int pnfs_error; int error; /* merge with pnfs_error */ - unsigned long good_bytes; /* boundary of good data */ + unsigned int good_bytes; /* boundary of good data */ unsigned long flags; /* diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h @@ -67,7 +67,7 @@ struct rpc_cred { const struct rpc_credops *cr_ops; unsigned long cr_expire; /* when to gc */ unsigned long cr_flags; /* various flags */ - atomic_t cr_count; /* ref count */ + refcount_t cr_count; /* ref count */ kuid_t cr_uid; @@ -100,7 +100,7 @@ struct rpc_auth { * differ from the flavor in * au_ops->au_flavor in gss * case) */ - atomic_t au_count; /* Reference counter */ + refcount_t au_count; /* Reference counter */ struct rpc_cred_cache * au_credcache; /* per-flavor data */ @@ -157,6 +157,7 @@ struct rpc_credops { int (*crkey_timeout)(struct rpc_cred *); bool (*crkey_to_expire)(struct rpc_cred *); char * (*crstringify_acceptor)(struct rpc_cred *); + bool (*crneed_reencode)(struct rpc_task *); }; extern const struct rpc_authops authunix_ops; @@ -192,6 +193,7 @@ __be32 * rpcauth_marshcred(struct rpc_task *, __be32 *); __be32 * rpcauth_checkverf(struct rpc_task *, __be32 *); int rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp, __be32 *data, void *obj); int rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, __be32 *data, void *obj); +bool rpcauth_xmit_need_reencode(struct rpc_task *task); int rpcauth_refreshcred(struct rpc_task *); void rpcauth_invalcred(struct rpc_task *); int rpcauth_uptodatecred(struct rpc_task *); @@ -204,11 +206,11 @@ bool rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *); char * rpcauth_stringify_acceptor(struct rpc_cred *); static inline -struct rpc_cred * get_rpccred(struct rpc_cred *cred) +struct rpc_cred *get_rpccred(struct rpc_cred *cred) { - if (cred != NULL) - atomic_inc(&cred->cr_count); - return cred; + if (cred != NULL && refcount_inc_not_zero(&cred->cr_count)) + return cred; + return NULL; } /** @@ -224,9 +226,7 @@ struct rpc_cred * get_rpccred(struct rpc_cred *cred) static inline struct rpc_cred * get_rpccred_rcu(struct rpc_cred *cred) { - if (atomic_inc_not_zero(&cred->cr_count)) - return cred; - return NULL; + return get_rpccred(cred); } #endif /* __KERNEL__ */ diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h @@ -70,6 +70,7 @@ struct gss_cl_ctx { refcount_t count; enum rpc_gss_proc gc_proc; u32 gc_seq; + u32 gc_seq_xmit; spinlock_t gc_seq_lock; struct gss_ctx *gc_gss_ctx; struct xdr_netobj gc_wire_ctx; diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h @@ -34,6 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef CONFIG_SUNRPC_BACKCHANNEL struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid); void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied); +void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task); void xprt_free_bc_request(struct rpc_rqst *req); int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs); void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs); diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h @@ -118,7 +118,8 @@ struct krb5_ctx { u8 acceptor_integ[GSS_KRB5_MAX_KEYLEN]; }; -extern spinlock_t krb5_seq_lock; +extern u32 gss_seq_send_fetch_and_inc(struct krb5_ctx *ctx); +extern u64 gss_seq_send64_fetch_and_inc(struct krb5_ctx *ctx); /* The length of the Kerberos GSS token header */ #define GSS_KRB5_TOK_HDR_LEN (16) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h @@ -140,8 +140,9 @@ struct rpc_task_setup { #define RPC_TASK_RUNNING 0 #define RPC_TASK_QUEUED 1 #define RPC_TASK_ACTIVE 2 -#define RPC_TASK_MSG_RECV 3 -#define RPC_TASK_MSG_RECV_WAIT 4 +#define RPC_TASK_NEED_XMIT 3 +#define RPC_TASK_NEED_RECV 4 +#define RPC_TASK_MSG_PIN_WAIT 5 #define RPC_IS_RUNNING(t) test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) #define rpc_set_running(t) set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) @@ -188,7 +189,6 @@ struct rpc_timer { struct rpc_wait_queue { spinlock_t lock; struct list_head tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */ - pid_t owner; /* process id of last task serviced */ unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */ unsigned char priority; /* current priority */ unsigned char nr; /* # tasks remaining for cookie */ @@ -204,7 +204,6 @@ struct rpc_wait_queue { * from a single cookie. The aim is to improve * performance of NFS operations such as read/write. */ -#define RPC_BATCH_COUNT 16 #define RPC_IS_PRIORITY(q) ((q)->maxpriority > 0) /* @@ -234,6 +233,9 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, struct rpc_task *task); void rpc_wake_up_queued_task(struct rpc_wait_queue *, struct rpc_task *); +void rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *, + struct rpc_task *, + int); void rpc_wake_up(struct rpc_wait_queue *); struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h @@ -84,7 +84,6 @@ struct svc_xprt { struct sockaddr_storage xpt_remote; /* remote peer's address */ size_t xpt_remotelen; /* length of address */ char xpt_remotebuf[INET6_ADDRSTRLEN + 10]; - struct rpc_wait_queue xpt_bc_pending; /* backchannel wait queue */ struct list_head xpt_users; /* callbacks on free */ struct net *xpt_net; diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h @@ -18,6 +18,7 @@ #include <asm/unaligned.h> #include <linux/scatterlist.h> +struct bio_vec; struct rpc_rqst; /* @@ -52,12 +53,14 @@ struct xdr_buf { struct kvec head[1], /* RPC header + non-page data */ tail[1]; /* Appended after page data */ + struct bio_vec *bvec; struct page ** pages; /* Array of pages */ unsigned int page_base, /* Start of page data */ page_len, /* Length of page data */ flags; /* Flags for data disposition */ #define XDRBUF_READ 0x01 /* target of file read */ #define XDRBUF_WRITE 0x02 /* source of file write */ +#define XDRBUF_SPARSE_PAGES 0x04 /* Page array is sparse */ unsigned int buflen, /* Total length of storage buffer */ len; /* Length of XDR encoded message */ @@ -69,6 +72,8 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) buf->head[0].iov_base = start; buf->head[0].iov_len = len; buf->tail[0].iov_len = 0; + buf->bvec = NULL; + buf->pages = NULL; buf->page_len = 0; buf->flags = 0; buf->len = 0; @@ -115,6 +120,9 @@ __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); void xdr_inline_pages(struct xdr_buf *, unsigned int, struct page **, unsigned int, unsigned int); void xdr_terminate_string(struct xdr_buf *, const u32); +size_t xdr_buf_pagecount(struct xdr_buf *buf); +int xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp); +void xdr_free_bvec(struct xdr_buf *buf); static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int len) { @@ -177,10 +185,7 @@ struct xdr_skb_reader { typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to, size_t len); -size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len); extern int csum_partial_copy_to_xdr(struct xdr_buf *, struct sk_buff *); -extern ssize_t xdr_partial_copy_from_skb(struct xdr_buf *, unsigned int, - struct xdr_skb_reader *, xdr_skb_read_actor); extern int xdr_encode_word(struct xdr_buf *, unsigned int, u32); extern int xdr_decode_word(struct xdr_buf *, unsigned int, u32 *); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h @@ -82,7 +82,14 @@ struct rpc_rqst { struct page **rq_enc_pages; /* scratch pages for use by gss privacy code */ void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ - struct list_head rq_list; + + union { + struct list_head rq_list; /* Slot allocation list */ + struct rb_node rq_recv; /* Receive queue */ + }; + + struct list_head rq_xmit; /* Send queue */ + struct list_head rq_xmit2; /* Send queue */ void *rq_buffer; /* Call XDR encode buffer */ size_t rq_callsize; @@ -103,6 +110,7 @@ struct rpc_rqst { /* A cookie used to track the state of the transport connection */ + atomic_t rq_pin; /* * Partial send handling @@ -133,7 +141,8 @@ struct rpc_xprt_ops { void (*connect)(struct rpc_xprt *xprt, struct rpc_task *task); int (*buf_alloc)(struct rpc_task *task); void (*buf_free)(struct rpc_task *task); - int (*send_request)(struct rpc_task *task); + void (*prepare_request)(struct rpc_rqst *req); + int (*send_request)(struct rpc_rqst *req); void (*set_retrans_timeout)(struct rpc_task *task); void (*timer)(struct rpc_xprt *xprt, struct rpc_task *task); void (*release_request)(struct rpc_task *task); @@ -234,9 +243,12 @@ struct rpc_xprt { */ spinlock_t transport_lock; /* lock transport info */ spinlock_t reserve_lock; /* lock slot table */ - spinlock_t recv_lock; /* lock receive list */ + spinlock_t queue_lock; /* send/receive queue lock */ u32 xid; /* Next XID value to use */ struct rpc_task * snd_task; /* Task blocked in send */ + + struct list_head xmit_queue; /* Send queue */ + struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ #if defined(CONFIG_SUNRPC_BACKCHANNEL) struct svc_serv *bc_serv; /* The RPC service which will */ @@ -248,7 +260,8 @@ struct rpc_xprt { struct list_head bc_pa_list; /* List of preallocated * backchannel rpc_rqst's */ #endif /* CONFIG_SUNRPC_BACKCHANNEL */ - struct list_head recv; + + struct rb_root recv_queue; /* Receive queue */ struct { unsigned long bind_count, /* total number of binds */ @@ -325,15 +338,18 @@ struct xprt_class { struct rpc_xprt *xprt_create_transport(struct xprt_create *args); void xprt_connect(struct rpc_task *task); void xprt_reserve(struct rpc_task *task); -void xprt_request_init(struct rpc_task *task); void xprt_retry_reserve(struct rpc_task *task); int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task); int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req); -void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_request_prepare(struct rpc_rqst *req); bool xprt_prepare_transmit(struct rpc_task *task); +void xprt_request_enqueue_transmit(struct rpc_task *task); +void xprt_request_enqueue_receive(struct rpc_task *task); +void xprt_request_wait_receive(struct rpc_task *task); +bool xprt_request_need_retransmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); void xprt_end_transmit(struct rpc_task *task); int xprt_adjust_timeout(struct rpc_rqst *req); @@ -373,8 +389,8 @@ int xprt_load_transport(const char *); void xprt_set_retrans_timeout_def(struct rpc_task *task); void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); -void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action); -void xprt_write_space(struct rpc_xprt *xprt); +void xprt_wait_for_buffer_space(struct rpc_xprt *xprt); +bool xprt_write_space(struct rpc_xprt *xprt); void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid); void xprt_update_rtt(struct rpc_task *task); @@ -382,6 +398,7 @@ void xprt_complete_rqst(struct rpc_task *task, int copied); void xprt_pin_rqst(struct rpc_rqst *req); void xprt_unpin_rqst(struct rpc_rqst *req); void xprt_release_rqst_cong(struct rpc_task *task); +bool xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req); void xprt_disconnect_done(struct rpc_xprt *xprt); void xprt_force_disconnect(struct rpc_xprt *xprt); void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); @@ -400,6 +417,8 @@ void xprt_unlock_connect(struct rpc_xprt *, void *); #define XPRT_BINDING (5) #define XPRT_CLOSING (6) #define XPRT_CONGESTED (9) +#define XPRT_CWND_WAIT (10) +#define XPRT_WRITE_SPACE (11) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h @@ -30,15 +30,25 @@ struct sock_xprt { /* * State of TCP reply receive */ - __be32 tcp_fraghdr, - tcp_xid, - tcp_calldir; + struct { + struct { + __be32 fraghdr, + xid, + calldir; + } __attribute__((packed)); - u32 tcp_offset, - tcp_reclen; + u32 offset, + len; - unsigned long tcp_copied, - tcp_flags; + unsigned long copied; + } recv; + + /* + * State of TCP transmit queue + */ + struct { + u32 offset; + } xmit; /* * Connection of transports @@ -68,20 +78,8 @@ struct sock_xprt { }; /* - * TCP receive state flags - */ -#define TCP_RCV_LAST_FRAG (1UL << 0) -#define TCP_RCV_COPY_FRAGHDR (1UL << 1) -#define TCP_RCV_COPY_XID (1UL << 2) -#define TCP_RCV_COPY_DATA (1UL << 3) -#define TCP_RCV_READ_CALLDIR (1UL << 4) -#define TCP_RCV_COPY_CALLDIR (1UL << 5) - -/* * TCP RPC flags */ -#define TCP_RPC_REPLY (1UL << 6) - #define XPRT_SOCK_CONNECTING 1U #define XPRT_SOCK_DATA_READY (2) #define XPRT_SOCK_UPD_TIMEOUT (3) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h @@ -263,7 +263,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, ); #define DEFINE_MR_EVENT(name) \ - DEFINE_EVENT(xprtrdma_mr, name, \ + DEFINE_EVENT(xprtrdma_mr, xprtrdma_mr_##name, \ TP_PROTO( \ const struct rpcrdma_mr *mr \ ), \ @@ -306,7 +306,7 @@ DECLARE_EVENT_CLASS(xprtrdma_cb_event, ** Connection events **/ -TRACE_EVENT(xprtrdma_conn_upcall, +TRACE_EVENT(xprtrdma_cm_event, TP_PROTO( const struct rpcrdma_xprt *r_xprt, struct rdma_cm_event *event @@ -377,7 +377,7 @@ DEFINE_RXPRT_EVENT(xprtrdma_reinsert); DEFINE_RXPRT_EVENT(xprtrdma_reconnect); DEFINE_RXPRT_EVENT(xprtrdma_inject_dsc); -TRACE_EVENT(xprtrdma_qp_error, +TRACE_EVENT(xprtrdma_qp_event, TP_PROTO( const struct rpcrdma_xprt *r_xprt, const struct ib_event *event @@ -509,7 +509,7 @@ TRACE_EVENT(xprtrdma_post_send, TP_STRUCT__entry( __field(const void *, req) __field(int, num_sge) - __field(bool, signaled) + __field(int, signaled) __field(int, status) ), @@ -651,11 +651,11 @@ DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg); DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li); DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake); -DEFINE_MR_EVENT(xprtrdma_localinv); -DEFINE_MR_EVENT(xprtrdma_dma_map); -DEFINE_MR_EVENT(xprtrdma_dma_unmap); -DEFINE_MR_EVENT(xprtrdma_remoteinv); -DEFINE_MR_EVENT(xprtrdma_recover_mr); +DEFINE_MR_EVENT(localinv); +DEFINE_MR_EVENT(map); +DEFINE_MR_EVENT(unmap); +DEFINE_MR_EVENT(remoteinv); +DEFINE_MR_EVENT(recycle); /** ** Reply events diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h @@ -470,14 +470,14 @@ TRACE_EVENT(xprt_ping, __get_str(addr), __get_str(port), __entry->status) ); -TRACE_EVENT(xs_tcp_data_ready, - TP_PROTO(struct rpc_xprt *xprt, int err, unsigned int total), +TRACE_EVENT(xs_stream_read_data, + TP_PROTO(struct rpc_xprt *xprt, ssize_t err, size_t total), TP_ARGS(xprt, err, total), TP_STRUCT__entry( - __field(int, err) - __field(unsigned int, total) + __field(ssize_t, err) + __field(size_t, total) __string(addr, xprt ? xprt->address_strings[RPC_DISPLAY_ADDR] : "(null)") __string(port, xprt ? xprt->address_strings[RPC_DISPLAY_PORT] : @@ -493,21 +493,11 @@ TRACE_EVENT(xs_tcp_data_ready, xprt->address_strings[RPC_DISPLAY_PORT] : "(null)"); ), - TP_printk("peer=[%s]:%s err=%d total=%u", __get_str(addr), + TP_printk("peer=[%s]:%s err=%zd total=%zu", __get_str(addr), __get_str(port), __entry->err, __entry->total) ); -#define rpc_show_sock_xprt_flags(flags) \ - __print_flags(flags, "|", \ - { TCP_RCV_LAST_FRAG, "TCP_RCV_LAST_FRAG" }, \ - { TCP_RCV_COPY_FRAGHDR, "TCP_RCV_COPY_FRAGHDR" }, \ - { TCP_RCV_COPY_XID, "TCP_RCV_COPY_XID" }, \ - { TCP_RCV_COPY_DATA, "TCP_RCV_COPY_DATA" }, \ - { TCP_RCV_READ_CALLDIR, "TCP_RCV_READ_CALLDIR" }, \ - { TCP_RCV_COPY_CALLDIR, "TCP_RCV_COPY_CALLDIR" }, \ - { TCP_RPC_REPLY, "TCP_RPC_REPLY" }) - -TRACE_EVENT(xs_tcp_data_recv, +TRACE_EVENT(xs_stream_read_request, TP_PROTO(struct sock_xprt *xs), TP_ARGS(xs), @@ -516,25 +506,22 @@ TRACE_EVENT(xs_tcp_data_recv, __string(addr, xs->xprt.address_strings[RPC_DISPLAY_ADDR]) __string(port, xs->xprt.address_strings[RPC_DISPLAY_PORT]) __field(u32, xid) - __field(unsigned long, flags) __field(unsigned long, copied) __field(unsigned int, reclen) - __field(unsigned long, offset) + __field(unsigned int, offset) ), TP_fast_assign( __assign_str(addr, xs->xprt.address_strings[RPC_DISPLAY_ADDR]); __assign_str(port, xs->xprt.address_strings[RPC_DISPLAY_PORT]); - __entry->xid = be32_to_cpu(xs->tcp_xid); - __entry->flags = xs->tcp_flags; - __entry->copied = xs->tcp_copied; - __entry->reclen = xs->tcp_reclen; - __entry->offset = xs->tcp_offset; + __entry->xid = be32_to_cpu(xs->recv.xid); + __entry->copied = xs->recv.copied; + __entry->reclen = xs->recv.len; + __entry->offset = xs->recv.offset; ), - TP_printk("peer=[%s]:%s xid=0x%08x flags=%s copied=%lu reclen=%u offset=%lu", + TP_printk("peer=[%s]:%s xid=0x%08x copied=%lu reclen=%u offset=%u", __get_str(addr), __get_str(port), __entry->xid, - rpc_show_sock_xprt_flags(__entry->flags), __entry->copied, __entry->reclen, __entry->offset) ); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c @@ -30,10 +30,9 @@ struct rpc_cred_cache { static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; -static DEFINE_SPINLOCK(rpc_authflavor_lock); -static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = { - &authnull_ops, /* AUTH_NULL */ - &authunix_ops, /* AUTH_UNIX */ +static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = { + [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops, + [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops, NULL, /* others can be loadable modules */ }; @@ -93,39 +92,65 @@ pseudoflavor_to_flavor(u32 flavor) { int rpcauth_register(const struct rpc_authops *ops) { + const struct rpc_authops *old; rpc_authflavor_t flavor; - int ret = -EPERM; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; - spin_lock(&rpc_authflavor_lock); - if (auth_flavors[flavor] == NULL) { - auth_flavors[flavor] = ops; - ret = 0; - } - spin_unlock(&rpc_authflavor_lock); - return ret; + old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], NULL, ops); + if (old == NULL || old == ops) + return 0; + return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_register); int rpcauth_unregister(const struct rpc_authops *ops) { + const struct rpc_authops *old; rpc_authflavor_t flavor; - int ret = -EPERM; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; - spin_lock(&rpc_authflavor_lock); - if (auth_flavors[flavor] == ops) { - auth_flavors[flavor] = NULL; - ret = 0; - } - spin_unlock(&rpc_authflavor_lock); - return ret; + + old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], ops, NULL); + if (old == ops || old == NULL) + return 0; + return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_unregister); +static const struct rpc_authops * +rpcauth_get_authops(rpc_authflavor_t flavor) +{ + const struct rpc_authops *ops; + + if (flavor >= RPC_AUTH_MAXFLAVOR) + return NULL; + + rcu_read_lock(); + ops = rcu_dereference(auth_flavors[flavor]); + if (ops == NULL) { + rcu_read_unlock(); + request_module("rpc-auth-%u", flavor); + rcu_read_lock(); + ops = rcu_dereference(auth_flavors[flavor]); + if (ops == NULL) + goto out; + } + if (!try_module_get(ops->owner)) + ops = NULL; +out: + rcu_read_unlock(); + return ops; +} + +static void +rpcauth_put_authops(const struct rpc_authops *ops) +{ + module_put(ops->owner); +} + /** * rpcauth_get_pseudoflavor - check if security flavor is supported * @flavor: a security flavor @@ -138,25 +163,16 @@ EXPORT_SYMBOL_GPL(rpcauth_unregister); rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) { - const struct rpc_authops *ops; + const struct rpc_authops *ops = rpcauth_get_authops(flavor); rpc_authflavor_t pseudoflavor; - ops = auth_flavors[flavor]; - if (ops == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); + if (!ops) return RPC_AUTH_MAXFLAVOR; - } - spin_unlock(&rpc_authflavor_lock); - pseudoflavor = flavor; if (ops->info2flavor != NULL) pseudoflavor = ops->info2flavor(info); - module_put(ops->owner); + rpcauth_put_authops(ops); return pseudoflavor; } EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); @@ -176,25 +192,15 @@ rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info) const struct rpc_authops *ops; int result; - if (flavor >= RPC_AUTH_MAXFLAVOR) - return -EINVAL; - - ops = auth_flavors[flavor]; + ops = rpcauth_get_authops(flavor); if (ops == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); return -ENOENT; - } - spin_unlock(&rpc_authflavor_lock); result = -ENOENT; if (ops->flavor2info != NULL) result = ops->flavor2info(pseudoflavor, info); - module_put(ops->owner); + rpcauth_put_authops(ops); return result; } EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); @@ -212,15 +218,13 @@ EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); int rpcauth_list_flavors(rpc_authflavor_t *array, int size) { - rpc_authflavor_t flavor; - int result = 0; + const struct rpc_authops *ops; + rpc_authflavor_t flavor, pseudos[4]; + int i, len, result = 0; - spin_lock(&rpc_authflavor_lock); + rcu_read_lock(); for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) { - const struct rpc_authops *ops = auth_flavors[flavor]; - rpc_authflavor_t pseudos[4]; - int i, len; - + ops = rcu_dereference(auth_flavors[flavor]); if (result >= size) { result = -ENOMEM; break; @@ -245,7 +249,7 @@ rpcauth_list_flavors(rpc_authflavor_t *array, int size) array[result++] = pseudos[i]; } } - spin_unlock(&rpc_authflavor_lock); + rcu_read_unlock(); dprintk("RPC: %s returns %d\n", __func__, result); return result; @@ -255,25 +259,17 @@ EXPORT_SYMBOL_GPL(rpcauth_list_flavors); struct rpc_auth * rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { - struct rpc_auth *auth; + struct rpc_auth *auth = ERR_PTR(-EINVAL); const struct rpc_authops *ops; - u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); + u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); - auth = ERR_PTR(-EINVAL); - if (flavor >= RPC_AUTH_MAXFLAVOR) + ops = rpcauth_get_authops(flavor); + if (ops == NULL) goto out; - if ((ops = auth_flavors[flavor]) == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); - goto out; - } - spin_unlock(&rpc_authflavor_lock); auth = ops->create(args, clnt); - module_put(ops->owner); + + rpcauth_put_authops(ops); if (IS_ERR(auth)) return auth; if (clnt->cl_auth) @@ -288,32 +284,37 @@ EXPORT_SYMBOL_GPL(rpcauth_create); void rpcauth_release(struct rpc_auth *auth) { - if (!atomic_dec_and_test(&auth->au_count)) + if (!refcount_dec_and_test(&auth->au_count)) return; auth->au_ops->destroy(auth); } static DEFINE_SPINLOCK(rpc_credcache_lock); -static void +/* + * On success, the caller is responsible for freeing the reference + * held by the hashtable + */ +static bool rpcauth_unhash_cred_locked(struct rpc_cred *cred) { + if (!test_and_clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + return false; hlist_del_rcu(&cred->cr_hash); - smp_mb__before_atomic(); - clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); + return true; } -static int +static bool rpcauth_unhash_cred(struct rpc_cred *cred) { spinlock_t *cache_lock; - int ret; + bool ret; + if (!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + return false; cache_lock = &cred->cr_auth->au_credcache->lock; spin_lock(cache_lock); - ret = atomic_read(&cred->cr_count) == 0; - if (ret) - rpcauth_unhash_cred_locked(cred); + ret = rpcauth_unhash_cred_locked(cred); spin_unlock(cache_lock); return ret; } @@ -392,6 +393,44 @@ void rpcauth_destroy_credlist(struct list_head *head) } } +static void +rpcauth_lru_add_locked(struct rpc_cred *cred) +{ + if (!list_empty(&cred->cr_lru)) + return; + number_cred_unused++; + list_add_tail(&cred->cr_lru, &cred_unused); +} + +static void +rpcauth_lru_add(struct rpc_cred *cred) +{ + if (!list_empty(&cred->cr_lru)) + return; + spin_lock(&rpc_credcache_lock); + rpcauth_lru_add_locked(cred); + spin_unlock(&rpc_credcache_lock); +} + +static void +rpcauth_lru_remove_locked(struct rpc_cred *cred) +{ + if (list_empty(&cred->cr_lru)) + return; + number_cred_unused--; + list_del_init(&cred->cr_lru); +} + +static void +rpcauth_lru_remove(struct rpc_cred *cred) +{ + if (list_empty(&cred->cr_lru)) + return; + spin_lock(&rpc_credcache_lock); + rpcauth_lru_remove_locked(cred); + spin_unlock(&rpc_credcache_lock); +} + /* * Clear the RPC credential cache, and delete those credentials * that are not referenced. @@ -411,13 +450,10 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache) head = &cache->hashtable[i]; while (!hlist_empty(head)) { cred = hlist_entry(head->first, struct rpc_cred, cr_hash); - get_rpccred(cred); - if (!list_empty(&cred->cr_lru)) { - list_del(&cred->cr_lru); - number_cred_unused--; - } - list_add_tail(&cred->cr_lru, &free); rpcauth_unhash_cred_locked(cred); + /* Note: We now hold a reference to cred */ + rpcauth_lru_remove_locked(cred); + list_add_tail(&cred->cr_lru, &free); } } spin_unlock(&cache->lock); @@ -451,7 +487,6 @@ EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache); static long rpcauth_prune_expired(struct list_head *free, int nr_to_scan) { - spinlock_t *cache_lock; struct rpc_cred *cred, *next; unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; long freed = 0; @@ -460,32 +495,24 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) if (nr_to_scan-- == 0) break; + if (refcount_read(&cred->cr_count) > 1) { + rpcauth_lru_remove_locked(cred); + continue; + } /* * Enforce a 60 second garbage collection moratorium * Note that the cred_unused list must be time-ordered. */ - if (time_in_range(cred->cr_expire, expired, jiffies) && - test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { - freed = SHRINK_STOP; - break; - } - - list_del_init(&cred->cr_lru); - number_cred_unused--; - freed++; - if (atomic_read(&cred->cr_count) != 0) + if (!time_in_range(cred->cr_expire, expired, jiffies)) + continue; + if (!rpcauth_unhash_cred(cred)) continue; - cache_lock = &cred->cr_auth->au_credcache->lock; - spin_lock(cache_lock); - if (atomic_read(&cred->cr_count) == 0) { - get_rpccred(cred); - list_add_tail(&cred->cr_lru, free); - rpcauth_unhash_cred_locked(cred); - } - spin_unlock(cache_lock); + rpcauth_lru_remove_locked(cred); + freed++; + list_add_tail(&cred->cr_lru, free); } - return freed; + return freed ? freed : SHRINK_STOP; } static unsigned long @@ -561,19 +588,15 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; if (flags & RPCAUTH_LOOKUP_RCU) { - if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) && - !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags)) - cred = entry; + if (test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags) || + refcount_read(&entry->cr_count) == 0) + continue; + cred = entry; break; } - spin_lock(&cache->lock); - if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) { - spin_unlock(&cache->lock); - continue; - } cred = get_rpccred(entry); - spin_unlock(&cache->lock); - break; + if (cred) + break; } rcu_read_unlock(); @@ -594,11 +617,13 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); - break; + if (cred) + break; } if (cred == NULL) { cred = new; set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); + refcount_inc(&cred->cr_count); hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]); } else list_add_tail(&new->cr_lru, &free); @@ -645,7 +670,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, { INIT_HLIST_NODE(&cred->cr_hash); INIT_LIST_HEAD(&cred->cr_lru); - atomic_set(&cred->cr_count, 1); + refcount_set(&cred->cr_count, 1); cred->cr_auth = auth; cred->cr_ops = ops; cred->cr_expire = jiffies; @@ -713,36 +738,29 @@ put_rpccred(struct rpc_cred *cred) { if (cred == NULL) return; - /* Fast path for unhashed credentials */ - if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) { - if (atomic_dec_and_test(&cred->cr_count)) - cred->cr_ops->crdestroy(cred); - return; - } - - if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock)) - return; - if (!list_empty(&cred->cr_lru)) { - number_cred_unused--; - list_del_init(&cred->cr_lru); - } - if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { - if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { - cred->cr_expire = jiffies; - list_add_tail(&cred->cr_lru, &cred_unused); - number_cred_unused++; - goto out_nodestroy; - } - if (!rpcauth_unhash_cred(cred)) { - /* We were hashed and someone looked us up... */ - goto out_nodestroy; - } + rcu_read_lock(); + if (refcount_dec_and_test(&cred->cr_count)) + goto destroy; + if (refcount_read(&cred->cr_count) != 1 || + !test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + goto out; + if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { + cred->cr_expire = jiffies; + rpcauth_lru_add(cred); + /* Race breaker */ + if (unlikely(!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))) + rpcauth_lru_remove(cred); + } else if (rpcauth_unhash_cred(cred)) { + rpcauth_lru_remove(cred); + if (refcount_dec_and_test(&cred->cr_count)) + goto destroy; } - spin_unlock(&rpc_credcache_lock); - cred->cr_ops->crdestroy(cred); +out: + rcu_read_unlock(); return; -out_nodestroy: - spin_unlock(&rpc_credcache_lock); +destroy: + rcu_read_unlock(); + cred->cr_ops->crdestroy(cred); } EXPORT_SYMBOL_GPL(put_rpccred); @@ -817,6 +835,16 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, return rpcauth_unwrap_req_decode(decode, rqstp, data, obj); } +bool +rpcauth_xmit_need_reencode(struct rpc_task *task) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + if (!cred || !cred->cr_ops->crneed_reencode) + return false; + return cred->cr_ops->crneed_reencode(task); +} + int rpcauth_refreshcred(struct rpc_task *task) { diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c @@ -274,7 +274,7 @@ static const struct rpc_authops generic_auth_ops = { static struct rpc_auth generic_auth = { .au_ops = &generic_auth_ops, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static bool generic_key_to_expire(struct rpc_cred *cred) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c @@ -1058,7 +1058,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) auth->au_flavor = flavor; if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; - atomic_set(&auth->au_count, 1); + refcount_set(&auth->au_count, 1); kref_init(&gss_auth->kref); err = rpcauth_init_credcache(auth); @@ -1187,7 +1187,7 @@ gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args, if (strcmp(gss_auth->target_name, args->target_name)) continue; } - if (!atomic_inc_not_zero(&gss_auth->rpc_auth.au_count)) + if (!refcount_inc_not_zero(&gss_auth->rpc_auth.au_count)) continue; goto out; } @@ -1984,6 +1984,46 @@ gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp, return decode(rqstp, &xdr, obj); } +static bool +gss_seq_is_newer(u32 new, u32 old) +{ + return (s32)(new - old) > 0; +} + +static bool +gss_xmit_need_reencode(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_cred *cred = req->rq_cred; + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + u32 win, seq_xmit; + bool ret = true; + + if (!ctx) + return true; + + if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq))) + goto out; + + seq_xmit = READ_ONCE(ctx->gc_seq_xmit); + while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) { + u32 tmp = seq_xmit; + + seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno); + if (seq_xmit == tmp) { + ret = false; + goto out; + } + } + + win = ctx->gc_win; + if (win > 0) + ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win); +out: + gss_put_ctx(ctx); + return ret; +} + static int gss_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj) @@ -2052,6 +2092,7 @@ static const struct rpc_credops gss_credops = { .crunwrap_resp = gss_unwrap_resp, .crkey_timeout = gss_key_timeout, .crstringify_acceptor = gss_stringify_acceptor, + .crneed_reencode = gss_xmit_need_reencode, }; static const struct rpc_credops gss_nullops = { diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -63,13 +63,12 @@ #include <linux/sunrpc/gss_krb5.h> #include <linux/random.h> #include <linux/crypto.h> +#include <linux/atomic.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif -DEFINE_SPINLOCK(krb5_seq_lock); - static void * setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) { @@ -124,6 +123,30 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) return krb5_hdr; } +u32 +gss_seq_send_fetch_and_inc(struct krb5_ctx *ctx) +{ + u32 old, seq_send = READ_ONCE(ctx->seq_send); + + do { + old = seq_send; + seq_send = cmpxchg(&ctx->seq_send, old, old + 1); + } while (old != seq_send); + return seq_send; +} + +u64 +gss_seq_send64_fetch_and_inc(struct krb5_ctx *ctx) +{ + u64 old, seq_send = READ_ONCE(ctx->seq_send); + + do { + old = seq_send; + seq_send = cmpxchg64(&ctx->seq_send64, old, old + 1); + } while (old != seq_send); + return seq_send; +} + static u32 gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, struct xdr_netobj *token) @@ -154,9 +177,7 @@ gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - spin_lock(&krb5_seq_lock); - seq_send = ctx->seq_send++; - spin_unlock(&krb5_seq_lock); + seq_send = gss_seq_send_fetch_and_inc(ctx); if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) @@ -174,7 +195,6 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, .data = cksumdata}; void *krb5_hdr; s32 now; - u64 seq_send; u8 *cksumkey; unsigned int cksum_usage; __be64 seq_send_be64; @@ -185,11 +205,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, /* Set up the sequence number. Now 64-bits in clear * text and w/o direction indicator */ - spin_lock(&krb5_seq_lock); - seq_send = ctx->seq_send64++; - spin_unlock(&krb5_seq_lock); - - seq_send_be64 = cpu_to_be64(seq_send); + seq_send_be64 = cpu_to_be64(gss_seq_send64_fetch_and_inc(ctx)); memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8); if (ctx->initiate) { diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -228,9 +228,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - spin_lock(&krb5_seq_lock); - seq_send = kctx->seq_send++; - spin_unlock(&krb5_seq_lock); + seq_send = gss_seq_send_fetch_and_inc(kctx); /* XXX would probably be more efficient to compute checksum * and encrypt at the same time: */ @@ -477,9 +475,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, *be16ptr++ = 0; be64ptr = (__be64 *)be16ptr; - spin_lock(&krb5_seq_lock); - *be64ptr = cpu_to_be64(kctx->seq_send64++); - spin_unlock(&krb5_seq_lock); + *be64ptr = cpu_to_be64(gss_seq_send64_fetch_and_inc(kctx)); err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages); if (err) diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -117,7 +117,7 @@ int gss_mech_register(struct gss_api_mech *gm) if (status) return status; spin_lock(&registered_mechs_lock); - list_add(&gm->gm_list, &registered_mechs); + list_add_rcu(&gm->gm_list, &registered_mechs); spin_unlock(&registered_mechs_lock); dprintk("RPC: registered gss mechanism %s\n", gm->gm_name); return 0; @@ -132,7 +132,7 @@ EXPORT_SYMBOL_GPL(gss_mech_register); void gss_mech_unregister(struct gss_api_mech *gm) { spin_lock(&registered_mechs_lock); - list_del(&gm->gm_list); + list_del_rcu(&gm->gm_list); spin_unlock(&registered_mechs_lock); dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name); gss_mech_free(gm); @@ -151,15 +151,15 @@ _gss_mech_get_by_name(const char *name) { struct gss_api_mech *pos, *gm = NULL; - spin_lock(&registered_mechs_lock); - list_for_each_entry(pos, &registered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, &registered_mechs, gm_list) { if (0 == strcmp(name, pos->gm_name)) { if (try_module_get(pos->gm_owner)) gm = pos; break; } } - spin_unlock(&registered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -186,8 +186,8 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) dprintk("RPC: %s(%s)\n", __func__, buf); request_module("rpc-auth-gss-%s", buf); - spin_lock(&registered_mechs_lock); - list_for_each_entry(pos, &registered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, &registered_mechs, gm_list) { if (obj->len == pos->gm_oid.len) { if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) { if (try_module_get(pos->gm_owner)) @@ -196,7 +196,7 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) } } } - spin_unlock(&registered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -216,15 +216,15 @@ static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor) { struct gss_api_mech *gm = NULL, *pos; - spin_lock(&registered_mechs_lock); - list_for_each_entry(pos, &registered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, &registered_mechs, gm_list) { if (!mech_supports_pseudoflavor(pos, pseudoflavor)) continue; if (try_module_get(pos->gm_owner)) gm = pos; break; } - spin_unlock(&registered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -257,8 +257,8 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) struct gss_api_mech *pos = NULL; int j, i = 0; - spin_lock(&registered_mechs_lock); - list_for_each_entry(pos, &registered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, &registered_mechs, gm_list) { for (j = 0; j < pos->gm_pf_num; j++) { if (i >= size) { spin_unlock(&registered_mechs_lock); @@ -267,7 +267,7 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) array_ptr[i++] = pos->gm_pfs[j].pseudoflavor; } } - spin_unlock(&registered_mechs_lock); + rcu_read_unlock(); return i; } diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -784,6 +784,7 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req, xdr_inline_pages(&req->rq_rcv_buf, PAGE_SIZE/2 /* pretty arbitrary */, arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE); + req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; done: if (err) dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err); diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c @@ -21,7 +21,7 @@ static struct rpc_cred null_cred; static struct rpc_auth * nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { - atomic_inc(&null_auth.au_count); + refcount_inc(&null_auth.au_count); return &null_auth; } @@ -119,7 +119,7 @@ struct rpc_auth null_auth = { .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static @@ -138,6 +138,6 @@ struct rpc_cred null_cred = { .cr_lru = LIST_HEAD_INIT(null_cred.cr_lru), .cr_auth = &null_auth, .cr_ops = &null_credops, - .cr_count = ATOMIC_INIT(1), + .cr_count = REFCOUNT_INIT(2), .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, }; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c @@ -34,7 +34,7 @@ unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { dprintk("RPC: creating UNIX authenticator for client %p\n", clnt); - atomic_inc(&unix_auth.au_count); + refcount_inc(&unix_auth.au_count); return &unix_auth; } @@ -239,7 +239,7 @@ struct rpc_auth unix_auth = { .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c @@ -91,7 +91,6 @@ struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags) return NULL; req->rq_xprt = xprt; - INIT_LIST_HEAD(&req->rq_list); INIT_LIST_HEAD(&req->rq_bc_list); /* Preallocate one XDR receive buffer */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c @@ -61,6 +61,7 @@ static void call_start(struct rpc_task *task); static void call_reserve(struct rpc_task *task); static void call_reserveresult(struct rpc_task *task); static void call_allocate(struct rpc_task *task); +static void call_encode(struct rpc_task *task); static void call_decode(struct rpc_task *task); static void call_bind(struct rpc_task *task); static void call_bind_status(struct rpc_task *task); @@ -1137,10 +1138,10 @@ EXPORT_SYMBOL_GPL(rpc_call_async); struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) { struct rpc_task *task; - struct xdr_buf *xbufp = &req->rq_snd_buf; struct rpc_task_setup task_setup_data = { .callback_ops = &rpc_default_ops, - .flags = RPC_TASK_SOFTCONN, + .flags = RPC_TASK_SOFTCONN | + RPC_TASK_NO_RETRANS_TIMEOUT, }; dprintk("RPC: rpc_run_bc_task req= %p\n", req); @@ -1148,14 +1149,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) * Create an rpc_task to send the data */ task = rpc_new_task(&task_setup_data); - task->tk_rqstp = req; - - /* - * Set up the xdr_buf length. - * This also indicates that the buffer is XDR encoded already. - */ - xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + - xbufp->tail[0].iov_len; + xprt_init_bc_request(req, task); task->tk_action = call_bc_transmit; atomic_inc(&task->tk_count); @@ -1558,7 +1552,6 @@ call_reserveresult(struct rpc_task *task) task->tk_status = 0; if (status >= 0) { if (task->tk_rqstp) { - xprt_request_init(task); task->tk_action = call_refresh; return; } @@ -1680,7 +1673,7 @@ call_allocate(struct rpc_task *task) dprint_status(task); task->tk_status = 0; - task->tk_action = call_bind; + task->tk_action = call_encode; if (req->rq_buffer) return; @@ -1721,22 +1714,15 @@ call_allocate(struct rpc_task *task) rpc_exit(task, -ERESTARTSYS); } -static inline int +static int rpc_task_need_encode(struct rpc_task *task) { - return task->tk_rqstp->rq_snd_buf.len == 0; + return test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) == 0 && + (!(task->tk_flags & RPC_TASK_SENT) || + !(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) || + xprt_request_need_retransmit(task)); } -static inline void -rpc_task_force_reencode(struct rpc_task *task) -{ - task->tk_rqstp->rq_snd_buf.len = 0; - task->tk_rqstp->rq_bytes_sent = 0; -} - -/* - * 3. Encode arguments of an RPC call - */ static void rpc_xdr_encode(struct rpc_task *task) { @@ -1752,6 +1738,7 @@ rpc_xdr_encode(struct rpc_task *task) xdr_buf_init(&req->rq_rcv_buf, req->rq_rbuffer, req->rq_rcvsize); + req->rq_bytes_sent = 0; p = rpc_encode_header(task); if (p == NULL) { @@ -1766,6 +1753,36 @@ rpc_xdr_encode(struct rpc_task *task) task->tk_status = rpcauth_wrap_req(task, encode, req, p, task->tk_msg.rpc_argp); + if (task->tk_status == 0) + xprt_request_prepare(req); +} + +/* + * 3. Encode arguments of an RPC call + */ +static void +call_encode(struct rpc_task *task) +{ + if (!rpc_task_need_encode(task)) + goto out; + /* Encode here so that rpcsec_gss can use correct sequence number. */ + rpc_xdr_encode(task); + /* Did the encode result in an error condition? */ + if (task->tk_status != 0) { + /* Was the error nonfatal? */ + if (task->tk_status == -EAGAIN || task->tk_status == -ENOMEM) + rpc_delay(task, HZ >> 4); + else + rpc_exit(task, task->tk_status); + return; + } + + /* Add task to reply queue before transmission to avoid races */ + if (rpc_reply_expected(task)) + xprt_request_enqueue_receive(task); + xprt_request_enqueue_transmit(task); +out: + task->tk_action = call_bind; } /* @@ -1947,43 +1964,16 @@ call_connect_status(struct rpc_task *task) static void call_transmit(struct rpc_task *task) { - int is_retrans = RPC_WAS_SENT(task); - dprint_status(task); - task->tk_action = call_status; - if (task->tk_status < 0) - return; - if (!xprt_prepare_transmit(task)) - return; - task->tk_action = call_transmit_status; - /* Encode here so that rpcsec_gss can use correct sequence number. */ - if (rpc_task_need_encode(task)) { - rpc_xdr_encode(task); - /* Did the encode result in an error condition? */ - if (task->tk_status != 0) { - /* Was the error nonfatal? */ - if (task->tk_status == -EAGAIN) - rpc_delay(task, HZ >> 4); - else - rpc_exit(task, task->tk_status); + task->tk_status = 0; + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) { + if (!xprt_prepare_transmit(task)) return; - } + xprt_transmit(task); } - xprt_transmit(task); - if (task->tk_status < 0) - return; - if (is_retrans) - task->tk_client->cl_stats->rpcretrans++; - /* - * On success, ensure that we call xprt_end_transmit() before sleeping - * in order to allow access to the socket to other RPC requests. - */ - call_transmit_status(task); - if (rpc_reply_expected(task)) - return; - task->tk_action = rpc_exit_task; - rpc_wake_up_queued_task(&task->tk_rqstp->rq_xprt->pending, task); + task->tk_action = call_transmit_status; + xprt_end_transmit(task); } /* @@ -1999,19 +1989,17 @@ call_transmit_status(struct rpc_task *task) * test first. */ if (task->tk_status == 0) { - xprt_end_transmit(task); - rpc_task_force_reencode(task); + xprt_request_wait_receive(task); return; } switch (task->tk_status) { - case -EAGAIN: - case -ENOBUFS: - break; default: dprint_status(task); - xprt_end_transmit(task); - rpc_task_force_reencode(task); + break; + case -EBADMSG: + task->tk_status = 0; + task->tk_action = call_encode; break; /* * Special cases: if we've been waiting on the @@ -2019,6 +2007,14 @@ call_transmit_status(struct rpc_task *task) * socket just returned a connection error, * then hold onto the transport lock. */ + case -ENOBUFS: + rpc_delay(task, HZ>>2); + /* fall through */ + case -EBADSLT: + case -EAGAIN: + task->tk_action = call_transmit; + task->tk_status = 0; + break; case -ECONNREFUSED: case -EHOSTDOWN: case -ENETDOWN: @@ -2026,7 +2022,6 @@ call_transmit_status(struct rpc_task *task) case -ENETUNREACH: case -EPERM: if (RPC_IS_SOFTCONN(task)) { - xprt_end_transmit(task); if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); @@ -2039,7 +2034,7 @@ call_transmit_status(struct rpc_task *task) case -EADDRINUSE: case -ENOTCONN: case -EPIPE: - rpc_task_force_reencode(task); + break; } } @@ -2053,6 +2048,11 @@ call_bc_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; + if (rpc_task_need_encode(task)) + xprt_request_enqueue_transmit(task); + if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + goto out_wakeup; + if (!xprt_prepare_transmit(task)) goto out_retry; @@ -2061,14 +2061,9 @@ call_bc_transmit(struct rpc_task *task) "error: %d\n", task->tk_status); goto out_done; } - if (req->rq_connect_cookie != req->rq_xprt->connect_cookie) - req->rq_bytes_sent = 0; xprt_transmit(task); - if (task->tk_status == -EAGAIN) - goto out_nospace; - xprt_end_transmit(task); dprint_status(task); switch (task->tk_status) { @@ -2084,6 +2079,8 @@ call_bc_transmit(struct rpc_task *task) case -ENOTCONN: case -EPIPE: break; + case -EAGAIN: + goto out_retry; case -ETIMEDOUT: /* * Problem reaching the server. Disconnect and let the @@ -2107,12 +2104,11 @@ call_bc_transmit(struct rpc_task *task) "error: %d\n", task->tk_status); break; } +out_wakeup: rpc_wake_up_queued_task(&req->rq_xprt->pending, task); out_done: task->tk_action = rpc_exit_task; return; -out_nospace: - req->rq_connect_cookie = req->rq_xprt->connect_cookie; out_retry: task->tk_status = 0; } @@ -2125,15 +2121,11 @@ static void call_status(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_rqst *req = task->tk_rqstp; int status; if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); - if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent) - task->tk_status = req->rq_reply_bytes_recvd; - dprint_status(task); status = task->tk_status; @@ -2173,13 +2165,8 @@ call_status(struct rpc_task *task) /* fall through */ case -EPIPE: case -ENOTCONN: - task->tk_action = call_bind; - break; - case -ENOBUFS: - rpc_delay(task, HZ>>2); - /* fall through */ case -EAGAIN: - task->tk_action = call_transmit; + task->tk_action = call_encode; break; case -EIO: /* shutdown or soft timeout */ @@ -2244,7 +2231,7 @@ call_timeout(struct rpc_task *task) rpcauth_invalcred(task); retry: - task->tk_action = call_bind; + task->tk_action = call_encode; task->tk_status = 0; } @@ -2261,6 +2248,11 @@ call_decode(struct rpc_task *task) dprint_status(task); + if (!decode) { + task->tk_action = rpc_exit_task; + return; + } + if (task->tk_flags & RPC_CALL_MAJORSEEN) { if (clnt->cl_chatty) { printk(KERN_NOTICE "%s: server %s OK\n", @@ -2283,7 +2275,7 @@ call_decode(struct rpc_task *task) if (req->rq_rcv_buf.len < 12) { if (!RPC_IS_SOFT(task)) { - task->tk_action = call_bind; + task->tk_action = call_encode; goto out_retry; } dprintk("RPC: %s: too small RPC reply size (%d bytes)\n", @@ -2298,13 +2290,11 @@ call_decode(struct rpc_task *task) goto out_retry; return; } - task->tk_action = rpc_exit_task; - if (decode) { - task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, - task->tk_msg.rpc_resp); - } + task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, + task->tk_msg.rpc_resp); + dprintk("RPC: %5u call_decode result %d\n", task->tk_pid, task->tk_status); return; @@ -2416,7 +2406,7 @@ rpc_verify_header(struct rpc_task *task) task->tk_garb_retry--; dprintk("RPC: %5u %s: retry garbled creds\n", task->tk_pid, __func__); - task->tk_action = call_bind; + task->tk_action = call_encode; goto out_retry; case RPC_AUTH_TOOWEAK: printk(KERN_NOTICE "RPC: server %s requires stronger " @@ -2485,7 +2475,7 @@ out_garbage: task->tk_garb_retry--; dprintk("RPC: %5u %s: retrying\n", task->tk_pid, __func__); - task->tk_action = call_bind; + task->tk_action = call_encode; out_retry: return ERR_PTR(-EAGAIN); } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c @@ -99,65 +99,79 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); } -static void rpc_rotate_queue_owner(struct rpc_wait_queue *queue) -{ - struct list_head *q = &queue->tasks[queue->priority]; - struct rpc_task *task; - - if (!list_empty(q)) { - task = list_first_entry(q, struct rpc_task, u.tk_wait.list); - if (task->tk_owner == queue->owner) - list_move_tail(&task->u.tk_wait.list, q); - } -} - static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) { if (queue->priority != priority) { - /* Fairness: rotate the list when changing priority */ - rpc_rotate_queue_owner(queue); queue->priority = priority; + queue->nr = 1U << priority; } } -static void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid) -{ - queue->owner = pid; - queue->nr = RPC_BATCH_COUNT; -} - static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) { rpc_set_waitqueue_priority(queue, queue->maxpriority); - rpc_set_waitqueue_owner(queue, 0); } /* - * Add new request to a priority queue. + * Add a request to a queue list */ -static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, - struct rpc_task *task, - unsigned char queue_priority) +static void +__rpc_list_enqueue_task(struct list_head *q, struct rpc_task *task) { - struct list_head *q; struct rpc_task *t; - INIT_LIST_HEAD(&task->u.tk_wait.links); - if (unlikely(queue_priority > queue->maxpriority)) - queue_priority = queue->maxpriority; - if (queue_priority > queue->priority) - rpc_set_waitqueue_priority(queue, queue_priority); - q = &queue->tasks[queue_priority]; list_for_each_entry(t, q, u.tk_wait.list) { if (t->tk_owner == task->tk_owner) { - list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); + list_add_tail(&task->u.tk_wait.links, + &t->u.tk_wait.links); + /* Cache the queue head in task->u.tk_wait.list */ + task->u.tk_wait.list.next = q; + task->u.tk_wait.list.prev = NULL; return; } } + INIT_LIST_HEAD(&task->u.tk_wait.links); list_add_tail(&task->u.tk_wait.list, q); } /* + * Remove request from a queue list + */ +static void +__rpc_list_dequeue_task(struct rpc_task *task) +{ + struct list_head *q; + struct rpc_task *t; + + if (task->u.tk_wait.list.prev == NULL) { + list_del(&task->u.tk_wait.links); + return; + } + if (!list_empty(&task->u.tk_wait.links)) { + t = list_first_entry(&task->u.tk_wait.links, + struct rpc_task, + u.tk_wait.links); + /* Assume __rpc_list_enqueue_task() cached the queue head */ + q = t->u.tk_wait.list.next; + list_add_tail(&t->u.tk_wait.list, q); + list_del(&task->u.tk_wait.links); + } + list_del(&task->u.tk_wait.list); +} + +/* + * Add new request to a priority queue. + */ +static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, + struct rpc_task *task, + unsigned char queue_priority) +{ + if (unlikely(queue_priority > queue->maxpriority)) + queue_priority = queue->maxpriority; + __rpc_list_enqueue_task(&queue->tasks[queue_priority], task); +} + +/* * Add new request to wait queue. * * Swapper tasks always get inserted at the head of the queue. @@ -194,13 +208,7 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, */ static void __rpc_remove_wait_queue_priority(struct rpc_task *task) { - struct rpc_task *t; - - if (!list_empty(&task->u.tk_wait.links)) { - t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list); - list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); - list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); - } + __rpc_list_dequeue_task(task); } /* @@ -212,7 +220,8 @@ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_tas __rpc_disable_timer(queue, task); if (RPC_IS_PRIORITY(queue)) __rpc_remove_wait_queue_priority(task); - list_del(&task->u.tk_wait.list); + else + list_del(&task->u.tk_wait.list); queue->qlen--; dprintk("RPC: %5u removed from queue %p \"%s\"\n", task->tk_pid, queue, rpc_qname(queue)); @@ -440,14 +449,28 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, struct rpc_task *task) +static struct rpc_task * +rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task, + bool (*action)(struct rpc_task *, void *), void *data) { if (RPC_IS_QUEUED(task)) { smp_rmb(); - if (task->tk_waitqueue == queue) - __rpc_do_wake_up_task_on_wq(wq, queue, task); + if (task->tk_waitqueue == queue) { + if (action == NULL || action(task, data)) { + __rpc_do_wake_up_task_on_wq(wq, queue, task); + return task; + } + } } + return NULL; +} + +static void +rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task) +{ + rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL); } /* @@ -465,6 +488,8 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, struct rpc_wait_queue *queue, struct rpc_task *task) { + if (!RPC_IS_QUEUED(task)) + return; spin_lock_bh(&queue->lock); rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); spin_unlock_bh(&queue->lock); @@ -475,12 +500,48 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, */ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) { + if (!RPC_IS_QUEUED(task)) + return; spin_lock_bh(&queue->lock); rpc_wake_up_task_queue_locked(queue, task); spin_unlock_bh(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); +static bool rpc_task_action_set_status(struct rpc_task *task, void *status) +{ + task->tk_status = *(int *)status; + return true; +} + +static void +rpc_wake_up_task_queue_set_status_locked(struct rpc_wait_queue *queue, + struct rpc_task *task, int status) +{ + rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue, + task, rpc_task_action_set_status, &status); +} + +/** + * rpc_wake_up_queued_task_set_status - wake up a task and set task->tk_status + * @queue: pointer to rpc_wait_queue + * @task: pointer to rpc_task + * @status: integer error value + * + * If @task is queued on @queue, then it is woken up, and @task->tk_status is + * set to the value of @status. + */ +void +rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *queue, + struct rpc_task *task, int status) +{ + if (!RPC_IS_QUEUED(task)) + return; + spin_lock_bh(&queue->lock); + rpc_wake_up_task_queue_set_status_locked(queue, task, status); + spin_unlock_bh(&queue->lock); +} + /* * Wake up the next task on a priority queue. */ @@ -493,17 +554,9 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q * Service a batch of tasks from a single owner. */ q = &queue->tasks[queue->priority]; - if (!list_empty(q)) { - task = list_entry(q->next, struct rpc_task, u.tk_wait.list); - if (queue->owner == task->tk_owner) { - if (--queue->nr) - goto out; - list_move_tail(&task->u.tk_wait.list, q); - } - /* - * Check if we need to switch queues. - */ - goto new_owner; + if (!list_empty(q) && --queue->nr) { + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); + goto out; } /* @@ -515,7 +568,7 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q else q = q - 1; if (!list_empty(q)) { - task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); goto new_queue; } } while (q != &queue->tasks[queue->priority]); @@ -525,8 +578,6 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q new_queue: rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0])); -new_owner: - rpc_set_waitqueue_owner(queue, task->tk_owner); out: return task; } @@ -553,12 +604,9 @@ struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, queue, rpc_qname(queue)); spin_lock_bh(&queue->lock); task = __rpc_find_next_queued(queue); - if (task != NULL) { - if (func(task, data)) - rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); - else - task = NULL; - } + if (task != NULL) + task = rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, + task, func, data); spin_unlock_bh(&queue->lock); return task; diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c @@ -26,7 +26,8 @@ * Possibly called several times to iterate over an sk_buff and copy * data out of it. */ -size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) +static size_t +xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) { if (len > desc->count) len = desc->count; @@ -36,7 +37,6 @@ size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) desc->offset += len; return len; } -EXPORT_SYMBOL_GPL(xdr_skb_read_bits); /** * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer @@ -69,7 +69,8 @@ static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, * @copy_actor: virtual method for copying data * */ -ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) +static ssize_t +xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) { struct page **ppage = xdr->pages; unsigned int len, pglen = xdr->page_len; @@ -104,7 +105,7 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct /* ACL likes to be lazy in allocating pages - ACLs * are small by default but can get huge. */ - if (unlikely(*ppage == NULL)) { + if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) { *ppage = alloc_page(GFP_ATOMIC); if (unlikely(*ppage == NULL)) { if (copied == 0) @@ -140,7 +141,6 @@ copy_tail: out: return copied; } -EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb); /** * csum_partial_copy_to_xdr - checksum and copy data diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c @@ -171,7 +171,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl, mutex_init(&xprt->xpt_mutex); spin_lock_init(&xprt->xpt_lock); set_bit(XPT_BUSY, &xprt->xpt_flags); - rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending"); xprt->xpt_net = get_net(net); strcpy(xprt->xpt_remotebuf, "uninitialized"); } @@ -895,7 +894,6 @@ int svc_send(struct svc_rqst *rqstp) else len = xprt->xpt_ops->xpo_sendto(rqstp); mutex_unlock(&xprt->xpt_mutex); - rpc_wake_up(&xprt->xpt_bc_pending); trace_svc_send(rqstp, len); svc_xprt_release(rqstp); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c @@ -1004,7 +1004,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) if (!bc_xprt) return -EAGAIN; - spin_lock(&bc_xprt->recv_lock); + spin_lock(&bc_xprt->queue_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) goto unlock_notfound; @@ -1022,7 +1022,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len); rqstp->rq_arg.len = 0; - spin_unlock(&bc_xprt->recv_lock); + spin_unlock(&bc_xprt->queue_lock); return 0; unlock_notfound: printk(KERN_NOTICE @@ -1031,7 +1031,7 @@ unlock_notfound: __func__, ntohl(calldir), bc_xprt, ntohl(xid)); unlock_eagain: - spin_unlock(&bc_xprt->recv_lock); + spin_unlock(&bc_xprt->queue_lock); return -EAGAIN; } diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c @@ -15,6 +15,7 @@ #include <linux/errno.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/msg_prot.h> +#include <linux/bvec.h> /* * XDR functions for basic NFS types @@ -128,6 +129,39 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); +size_t +xdr_buf_pagecount(struct xdr_buf *buf) +{ + if (!buf->page_len) + return 0; + return (buf->page_base + buf->page_len + PAGE_SIZE - 1) >> PAGE_SHIFT; +} + +int +xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) +{ + size_t i, n = xdr_buf_pagecount(buf); + + if (n != 0 && buf->bvec == NULL) { + buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp); + if (!buf->bvec) + return -ENOMEM; + for (i = 0; i < n; i++) { + buf->bvec[i].bv_page = buf->pages[i]; + buf->bvec[i].bv_len = PAGE_SIZE; + buf->bvec[i].bv_offset = 0; + } + } + return 0; +} + +void +xdr_free_bvec(struct xdr_buf *buf) +{ + kfree(buf->bvec); + buf->bvec = NULL; +} + void xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, struct page **pages, unsigned int base, unsigned int len) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c @@ -68,8 +68,6 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net); static __be32 xprt_alloc_xid(struct rpc_xprt *xprt); static void xprt_connect_status(struct rpc_task *task); -static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); -static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *); static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); @@ -171,6 +169,17 @@ out: } EXPORT_SYMBOL_GPL(xprt_load_transport); +static void xprt_clear_locked(struct rpc_xprt *xprt) +{ + xprt->snd_task = NULL; + if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { + smp_mb__before_atomic(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_atomic(); + } else + queue_work(xprtiod_workqueue, &xprt->task_cleanup); +} + /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport @@ -183,44 +192,53 @@ EXPORT_SYMBOL_GPL(xprt_load_transport); int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - int priority; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) return 1; goto out_sleep; } + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; xprt->snd_task = task; - if (req != NULL) - req->rq_ntrans++; return 1; +out_unlock: + xprt_clear_locked(xprt); out_sleep: dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = 0; + task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - if (req == NULL) - priority = RPC_PRIORITY_LOW; - else if (!req->rq_ntrans) - priority = RPC_PRIORITY_NORMAL; - else - priority = RPC_PRIORITY_HIGH; - rpc_sleep_on_priority(&xprt->sending, task, NULL, priority); + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt); -static void xprt_clear_locked(struct rpc_xprt *xprt) +static bool +xprt_need_congestion_window_wait(struct rpc_xprt *xprt) { - xprt->snd_task = NULL; - if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { - smp_mb__before_atomic(); - clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_atomic(); - } else - queue_work(xprtiod_workqueue, &xprt->task_cleanup); + return test_bit(XPRT_CWND_WAIT, &xprt->state); +} + +static void +xprt_set_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (!list_empty(&xprt->xmit_queue)) { + /* Peek at head of queue to see if it can make progress */ + if (list_first_entry(&xprt->xmit_queue, struct rpc_rqst, + rq_xmit)->rq_cong) + return; + } + set_bit(XPRT_CWND_WAIT, &xprt->state); +} + +static void +xprt_test_and_clear_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (!RPCXPRT_CONGESTED(xprt)) + clear_bit(XPRT_CWND_WAIT, &xprt->state); } /* @@ -230,11 +248,11 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) * Same as xprt_reserve_xprt, but Van Jacobson congestion control is * integrated into the decision of whether a request is allowed to be * woken up and given access to the transport. + * Note that the lock is only granted if we know there are free slots. */ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - int priority; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) @@ -245,25 +263,19 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) xprt->snd_task = task; return 1; } - if (__xprt_get_cong(xprt, task)) { + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; + if (!xprt_need_congestion_window_wait(xprt)) { xprt->snd_task = task; - req->rq_ntrans++; return 1; } +out_unlock: xprt_clear_locked(xprt); out_sleep: - if (req) - __xprt_put_cong(xprt, req); dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = 0; + task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - if (req == NULL) - priority = RPC_PRIORITY_LOW; - else if (!req->rq_ntrans) - priority = RPC_PRIORITY_NORMAL; - else - priority = RPC_PRIORITY_HIGH; - rpc_sleep_on_priority(&xprt->sending, task, NULL, priority); + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); @@ -272,6 +284,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; + if (test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == task) + return 1; spin_lock_bh(&xprt->transport_lock); retval = xprt->ops->reserve_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); @@ -281,12 +295,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) static bool __xprt_lock_write_func(struct rpc_task *task, void *data) { struct rpc_xprt *xprt = data; - struct rpc_rqst *req; - req = task->tk_rqstp; xprt->snd_task = task; - if (req) - req->rq_ntrans++; return true; } @@ -294,53 +304,30 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, __xprt_lock_write_func, xprt)) return; +out_unlock: xprt_clear_locked(xprt); } -static bool __xprt_lock_write_cong_func(struct rpc_task *task, void *data) -{ - struct rpc_xprt *xprt = data; - struct rpc_rqst *req; - - req = task->tk_rqstp; - if (req == NULL) { - xprt->snd_task = task; - return true; - } - if (__xprt_get_cong(xprt, task)) { - xprt->snd_task = task; - req->rq_ntrans++; - return true; - } - return false; -} - static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (RPCXPRT_CONGESTED(xprt)) + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; + if (xprt_need_congestion_window_wait(xprt)) goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, - __xprt_lock_write_cong_func, xprt)) + __xprt_lock_write_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); } -static void xprt_task_clear_bytes_sent(struct rpc_task *task) -{ - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } -} - /** * xprt_release_xprt - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting @@ -351,7 +338,6 @@ static void xprt_task_clear_bytes_sent(struct rpc_task *task) void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } @@ -369,7 +355,6 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } @@ -378,6 +363,8 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { + if (xprt->snd_task != task) + return; spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); @@ -388,16 +375,16 @@ static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *ta * overflowed. Put the task to sleep if this is the case. */ static int -__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task) +__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; - if (req->rq_cong) return 1; dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n", - task->tk_pid, xprt->cong, xprt->cwnd); - if (RPCXPRT_CONGESTED(xprt)) + req->rq_task->tk_pid, xprt->cong, xprt->cwnd); + if (RPCXPRT_CONGESTED(xprt)) { + xprt_set_congestion_window_wait(xprt); return 0; + } req->rq_cong = 1; xprt->cong += RPC_CWNDSCALE; return 1; @@ -414,10 +401,32 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) return; req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; + xprt_test_and_clear_congestion_window_wait(xprt); __xprt_lock_write_next_cong(xprt); } /** + * xprt_request_get_cong - Request congestion control credits + * @xprt: pointer to transport + * @req: pointer to RPC request + * + * Useful for transports that require congestion control. + */ +bool +xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + bool ret = false; + + if (req->rq_cong) + return true; + spin_lock_bh(&xprt->transport_lock); + ret = __xprt_get_cong(xprt, req) != 0; + spin_unlock_bh(&xprt->transport_lock); + return ret; +} +EXPORT_SYMBOL_GPL(xprt_request_get_cong); + +/** * xprt_release_rqst_cong - housekeeping when request is complete * @task: RPC request that recently completed * @@ -431,6 +440,20 @@ void xprt_release_rqst_cong(struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); +/* + * Clear the congestion window wait flag and wake up the next + * entry on xprt->sending + */ +static void +xprt_clear_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) { + spin_lock_bh(&xprt->transport_lock); + __xprt_lock_write_next_cong(xprt); + spin_unlock_bh(&xprt->transport_lock); + } +} + /** * xprt_adjust_cwnd - adjust transport congestion window * @xprt: pointer to xprt @@ -488,39 +511,46 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); /** * xprt_wait_for_buffer_space - wait for transport output buffer to clear - * @task: task to be put to sleep - * @action: function pointer to be executed after wait + * @xprt: transport * * Note that we only set the timer for the case of RPC_IS_SOFT(), since * we don't in general want to force a socket disconnection due to * an incomplete RPC call transmission. */ -void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action) +void xprt_wait_for_buffer_space(struct rpc_xprt *xprt) { - struct rpc_rqst *req = task->tk_rqstp; - struct rpc_xprt *xprt = req->rq_xprt; - - task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; - rpc_sleep_on(&xprt->pending, task, action); + set_bit(XPRT_WRITE_SPACE, &xprt->state); } EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); +static bool +xprt_clear_write_space_locked(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state)) { + __xprt_lock_write_next(xprt); + dprintk("RPC: write space: waking waiting task on " + "xprt %p\n", xprt); + return true; + } + return false; +} + /** * xprt_write_space - wake the task waiting for transport output buffer space * @xprt: transport with waiting tasks * * Can be called in a soft IRQ context, so xprt_write_space never sleeps. */ -void xprt_write_space(struct rpc_xprt *xprt) +bool xprt_write_space(struct rpc_xprt *xprt) { + bool ret; + + if (!test_bit(XPRT_WRITE_SPACE, &xprt->state)) + return false; spin_lock_bh(&xprt->transport_lock); - if (xprt->snd_task) { - dprintk("RPC: write space: waking waiting task on " - "xprt %p\n", xprt); - rpc_wake_up_queued_task_on_wq(xprtiod_workqueue, - &xprt->pending, xprt->snd_task); - } + ret = xprt_clear_write_space_locked(xprt); spin_unlock_bh(&xprt->transport_lock); + return ret; } EXPORT_SYMBOL_GPL(xprt_write_space); @@ -631,6 +661,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); + xprt_clear_write_space_locked(xprt); xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } @@ -654,6 +685,22 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_force_disconnect); +static unsigned int +xprt_connect_cookie(struct rpc_xprt *xprt) +{ + return READ_ONCE(xprt->connect_cookie); +} + +static bool +xprt_request_retransmit_after_disconnect(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + return req->rq_connect_cookie != xprt_connect_cookie(xprt) || + !xprt_connected(xprt); +} + /** * xprt_conditional_disconnect - force a transport to disconnect * @xprt: transport to disconnect @@ -692,7 +739,7 @@ static void xprt_schedule_autodisconnect(struct rpc_xprt *xprt) __must_hold(&xprt->transport_lock) { - if (list_empty(&xprt->recv) && xprt_has_timer(xprt)) + if (RB_EMPTY_ROOT(&xprt->recv_queue) && xprt_has_timer(xprt)) mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); } @@ -702,7 +749,7 @@ xprt_init_autodisconnect(struct timer_list *t) struct rpc_xprt *xprt = from_timer(xprt, t, timer); spin_lock(&xprt->transport_lock); - if (!list_empty(&xprt->recv)) + if (!RB_EMPTY_ROOT(&xprt->recv_queue)) goto out_abort; /* Reset xprt->last_used to avoid connect/autodisconnect cycling */ xprt->last_used = jiffies; @@ -726,7 +773,6 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; - xprt_task_clear_bytes_sent(task); xprt->snd_task = cookie; ret = true; out: @@ -772,7 +818,6 @@ void xprt_connect(struct rpc_task *task) xprt->ops->close(xprt); if (!xprt_connected(xprt)) { - task->tk_rqstp->rq_bytes_sent = 0; task->tk_timeout = task->tk_rqstp->rq_timeout; task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; rpc_sleep_on(&xprt->pending, task, xprt_connect_status); @@ -789,17 +834,11 @@ void xprt_connect(struct rpc_task *task) static void xprt_connect_status(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - - if (task->tk_status == 0) { - xprt->stat.connect_count++; - xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; + switch (task->tk_status) { + case 0: dprintk("RPC: %5u xprt_connect_status: connection established\n", task->tk_pid); - return; - } - - switch (task->tk_status) { + break; case -ECONNREFUSED: case -ECONNRESET: case -ECONNABORTED: @@ -816,28 +855,97 @@ static void xprt_connect_status(struct rpc_task *task) default: dprintk("RPC: %5u xprt_connect_status: error %d connecting to " "server %s\n", task->tk_pid, -task->tk_status, - xprt->servername); + task->tk_rqstp->rq_xprt->servername); task->tk_status = -EIO; } } +enum xprt_xid_rb_cmp { + XID_RB_EQUAL, + XID_RB_LEFT, + XID_RB_RIGHT, +}; +static enum xprt_xid_rb_cmp +xprt_xid_cmp(__be32 xid1, __be32 xid2) +{ + if (xid1 == xid2) + return XID_RB_EQUAL; + if ((__force u32)xid1 < (__force u32)xid2) + return XID_RB_LEFT; + return XID_RB_RIGHT; +} + +static struct rpc_rqst * +xprt_request_rb_find(struct rpc_xprt *xprt, __be32 xid) +{ + struct rb_node *n = xprt->recv_queue.rb_node; + struct rpc_rqst *req; + + while (n != NULL) { + req = rb_entry(n, struct rpc_rqst, rq_recv); + switch (xprt_xid_cmp(xid, req->rq_xid)) { + case XID_RB_LEFT: + n = n->rb_left; + break; + case XID_RB_RIGHT: + n = n->rb_right; + break; + case XID_RB_EQUAL: + return req; + } + } + return NULL; +} + +static void +xprt_request_rb_insert(struct rpc_xprt *xprt, struct rpc_rqst *new) +{ + struct rb_node **p = &xprt->recv_queue.rb_node; + struct rb_node *n = NULL; + struct rpc_rqst *req; + + while (*p != NULL) { + n = *p; + req = rb_entry(n, struct rpc_rqst, rq_recv); + switch(xprt_xid_cmp(new->rq_xid, req->rq_xid)) { + case XID_RB_LEFT: + p = &n->rb_left; + break; + case XID_RB_RIGHT: + p = &n->rb_right; + break; + case XID_RB_EQUAL: + WARN_ON_ONCE(new != req); + return; + } + } + rb_link_node(&new->rq_recv, n, p); + rb_insert_color(&new->rq_recv, &xprt->recv_queue); +} + +static void +xprt_request_rb_remove(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + rb_erase(&req->rq_recv, &xprt->recv_queue); +} + /** * xprt_lookup_rqst - find an RPC request corresponding to an XID * @xprt: transport on which the original request was transmitted * @xid: RPC XID of incoming reply * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) { struct rpc_rqst *entry; - list_for_each_entry(entry, &xprt->recv, rq_list) - if (entry->rq_xid == xid) { - trace_xprt_lookup_rqst(xprt, xid, 0); - entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); - return entry; - } + entry = xprt_request_rb_find(xprt, xid); + if (entry != NULL) { + trace_xprt_lookup_rqst(xprt, xid, 0); + entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); + return entry; + } dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n", ntohl(xid)); @@ -847,16 +955,22 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) } EXPORT_SYMBOL_GPL(xprt_lookup_rqst); +static bool +xprt_is_pinned_rqst(struct rpc_rqst *req) +{ + return atomic_read(&req->rq_pin) != 0; +} + /** * xprt_pin_rqst - Pin a request on the transport receive list * @req: Request to pin * * Caller must ensure this is atomic with the call to xprt_lookup_rqst() - * so should be holding the xprt transport lock. + * so should be holding the xprt receive lock. */ void xprt_pin_rqst(struct rpc_rqst *req) { - set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate); + atomic_inc(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_pin_rqst); @@ -864,38 +978,87 @@ EXPORT_SYMBOL_GPL(xprt_pin_rqst); * xprt_unpin_rqst - Unpin a request on the transport receive list * @req: Request to pin * - * Caller should be holding the xprt transport lock. + * Caller should be holding the xprt receive lock. */ void xprt_unpin_rqst(struct rpc_rqst *req) { - struct rpc_task *task = req->rq_task; - - clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate); - if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate)) - wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV); + if (!test_bit(RPC_TASK_MSG_PIN_WAIT, &req->rq_task->tk_runstate)) { + atomic_dec(&req->rq_pin); + return; + } + if (atomic_dec_and_test(&req->rq_pin)) + wake_up_var(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_unpin_rqst); static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) -__must_hold(&req->rq_xprt->recv_lock) { - struct rpc_task *task = req->rq_task; + wait_var_event(&req->rq_pin, !xprt_is_pinned_rqst(req)); +} - if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { - spin_unlock(&req->rq_xprt->recv_lock); - set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); - wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV, - TASK_UNINTERRUPTIBLE); - clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); - spin_lock(&req->rq_xprt->recv_lock); - } +static bool +xprt_request_data_received(struct rpc_task *task) +{ + return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && + READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) != 0; +} + +static bool +xprt_request_need_enqueue_receive(struct rpc_task *task, struct rpc_rqst *req) +{ + return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && + READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) == 0; +} + +/** + * xprt_request_enqueue_receive - Add an request to the receive queue + * @task: RPC task + * + */ +void +xprt_request_enqueue_receive(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (!xprt_request_need_enqueue_receive(task, req)) + return; + spin_lock(&xprt->queue_lock); + + /* Update the softirq receive buffer */ + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, + sizeof(req->rq_private_buf)); + + /* Add request to the receive list */ + xprt_request_rb_insert(xprt, req); + set_bit(RPC_TASK_NEED_RECV, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + + xprt_reset_majortimeo(req); + /* Turn off autodisconnect */ + del_singleshot_timer_sync(&xprt->timer); +} + +/** + * xprt_request_dequeue_receive_locked - Remove a request from the receive queue + * @task: RPC task + * + * Caller must hold xprt->queue_lock. + */ +static void +xprt_request_dequeue_receive_locked(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) + xprt_request_rb_remove(req->rq_xprt, req); } /** * xprt_update_rtt - Update RPC RTT statistics * @task: RPC request that recently completed * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ void xprt_update_rtt(struct rpc_task *task) { @@ -917,7 +1080,7 @@ EXPORT_SYMBOL_GPL(xprt_update_rtt); * @task: RPC request that recently completed * @copied: actual number of bytes received from the transport * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ void xprt_complete_rqst(struct rpc_task *task, int copied) { @@ -930,12 +1093,12 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) xprt->stat.recvs++; - list_del_init(&req->rq_list); req->rq_private_buf.len = copied; /* Ensure all writes are done before we update */ /* req->rq_reply_bytes_recvd */ smp_wmb(); req->rq_reply_bytes_recvd = copied; + xprt_request_dequeue_receive_locked(task); rpc_wake_up_queued_task(&xprt->pending, task); } EXPORT_SYMBOL_GPL(xprt_complete_rqst); @@ -957,6 +1120,172 @@ static void xprt_timer(struct rpc_task *task) } /** + * xprt_request_wait_receive - wait for the reply to an RPC request + * @task: RPC task about to send a request + * + */ +void xprt_request_wait_receive(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (!test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) + return; + /* + * Sleep on the pending queue if we're expecting a reply. + * The spinlock ensures atomicity between the test of + * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). + */ + spin_lock(&xprt->queue_lock); + if (test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { + xprt->ops->set_retrans_timeout(task); + rpc_sleep_on(&xprt->pending, task, xprt_timer); + /* + * Send an extra queue wakeup call if the + * connection was dropped in case the call to + * rpc_sleep_on() raced. + */ + if (xprt_request_retransmit_after_disconnect(task)) + rpc_wake_up_queued_task_set_status(&xprt->pending, + task, -ENOTCONN); + } + spin_unlock(&xprt->queue_lock); +} + +static bool +xprt_request_need_enqueue_transmit(struct rpc_task *task, struct rpc_rqst *req) +{ + return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); +} + +/** + * xprt_request_enqueue_transmit - queue a task for transmission + * @task: pointer to rpc_task + * + * Add a task to the transmission queue. + */ +void +xprt_request_enqueue_transmit(struct rpc_task *task) +{ + struct rpc_rqst *pos, *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (xprt_request_need_enqueue_transmit(task, req)) { + spin_lock(&xprt->queue_lock); + /* + * Requests that carry congestion control credits are added + * to the head of the list to avoid starvation issues. + */ + if (req->rq_cong) { + xprt_clear_congestion_window_wait(xprt); + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_cong) + continue; + /* Note: req is added _before_ pos */ + list_add_tail(&req->rq_xmit, &pos->rq_xmit); + INIT_LIST_HEAD(&req->rq_xmit2); + goto out; + } + } else if (RPC_IS_SWAPPER(task)) { + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_cong || pos->rq_bytes_sent) + continue; + if (RPC_IS_SWAPPER(pos->rq_task)) + continue; + /* Note: req is added _before_ pos */ + list_add_tail(&req->rq_xmit, &pos->rq_xmit); + INIT_LIST_HEAD(&req->rq_xmit2); + goto out; + } + } else { + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_task->tk_owner != task->tk_owner) + continue; + list_add_tail(&req->rq_xmit2, &pos->rq_xmit2); + INIT_LIST_HEAD(&req->rq_xmit); + goto out; + } + } + list_add_tail(&req->rq_xmit, &xprt->xmit_queue); + INIT_LIST_HEAD(&req->rq_xmit2); +out: + set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + } +} + +/** + * xprt_request_dequeue_transmit_locked - remove a task from the transmission queue + * @task: pointer to rpc_task + * + * Remove a task from the transmission queue + * Caller must hold xprt->queue_lock + */ +static void +xprt_request_dequeue_transmit_locked(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + return; + if (!list_empty(&req->rq_xmit)) { + list_del(&req->rq_xmit); + if (!list_empty(&req->rq_xmit2)) { + struct rpc_rqst *next = list_first_entry(&req->rq_xmit2, + struct rpc_rqst, rq_xmit2); + list_del(&req->rq_xmit2); + list_add_tail(&next->rq_xmit, &next->rq_xprt->xmit_queue); + } + } else + list_del(&req->rq_xmit2); +} + +/** + * xprt_request_dequeue_transmit - remove a task from the transmission queue + * @task: pointer to rpc_task + * + * Remove a task from the transmission queue + */ +static void +xprt_request_dequeue_transmit(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + spin_unlock(&xprt->queue_lock); +} + +/** + * xprt_request_prepare - prepare an encoded request for transport + * @req: pointer to rpc_rqst + * + * Calls into the transport layer to do whatever is needed to prepare + * the request for transmission or receive. + */ +void +xprt_request_prepare(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + + if (xprt->ops->prepare_request) + xprt->ops->prepare_request(req); +} + +/** + * xprt_request_need_retransmit - Test if a task needs retransmission + * @task: pointer to rpc_task + * + * Test for whether a connection breakage requires the task to retransmit + */ +bool +xprt_request_need_retransmit(struct rpc_task *task) +{ + return xprt_request_retransmit_after_disconnect(task); +} + +/** * xprt_prepare_transmit - reserve the transport before sending a request * @task: RPC task about to send a request * @@ -965,32 +1294,18 @@ bool xprt_prepare_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - bool ret = false; dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid); - spin_lock_bh(&xprt->transport_lock); - if (!req->rq_bytes_sent) { - if (req->rq_reply_bytes_recvd) { - task->tk_status = req->rq_reply_bytes_recvd; - goto out_unlock; - } - if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) - && xprt_connected(xprt) - && req->rq_connect_cookie == xprt->connect_cookie) { - xprt->ops->set_retrans_timeout(task); - rpc_sleep_on(&xprt->pending, task, xprt_timer); - goto out_unlock; - } - } - if (!xprt->ops->reserve_xprt(xprt, task)) { - task->tk_status = -EAGAIN; - goto out_unlock; + if (!xprt_lock_write(xprt, task)) { + /* Race breaker: someone may have transmitted us */ + if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + rpc_wake_up_queued_task_set_status(&xprt->sending, + task, 0); + return false; + } - ret = true; -out_unlock: - spin_unlock_bh(&xprt->transport_lock); - return ret; + return true; } void xprt_end_transmit(struct rpc_task *task) @@ -999,54 +1314,62 @@ void xprt_end_transmit(struct rpc_task *task) } /** - * xprt_transmit - send an RPC request on a transport - * @task: controlling RPC task + * xprt_request_transmit - send an RPC request on a transport + * @req: pointer to request to transmit + * @snd_task: RPC task that owns the transport lock * - * We have to copy the iovec because sendmsg fiddles with its contents. + * This performs the transmission of a single request. + * Note that if the request is not the same as snd_task, then it + * does need to be pinned. + * Returns '0' on success. */ -void xprt_transmit(struct rpc_task *task) +static int +xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) { - struct rpc_rqst *req = task->tk_rqstp; - struct rpc_xprt *xprt = req->rq_xprt; + struct rpc_xprt *xprt = req->rq_xprt; + struct rpc_task *task = req->rq_task; unsigned int connect_cookie; + int is_retrans = RPC_WAS_SENT(task); int status; dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); - if (!req->rq_reply_bytes_recvd) { - if (list_empty(&req->rq_list) && rpc_reply_expected(task)) { - /* - * Add to the list only if we're expecting a reply - */ - /* Update the softirq receive buffer */ - memcpy(&req->rq_private_buf, &req->rq_rcv_buf, - sizeof(req->rq_private_buf)); - /* Add request to the receive list */ - spin_lock(&xprt->recv_lock); - list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock(&xprt->recv_lock); - xprt_reset_majortimeo(req); - /* Turn off autodisconnect */ - del_singleshot_timer_sync(&xprt->timer); + if (!req->rq_bytes_sent) { + if (xprt_request_data_received(task)) { + status = 0; + goto out_dequeue; } - } else if (!req->rq_bytes_sent) - return; + /* Verify that our message lies in the RPCSEC_GSS window */ + if (rpcauth_xmit_need_reencode(task)) { + status = -EBADMSG; + goto out_dequeue; + } + } + + /* + * Update req->rq_ntrans before transmitting to avoid races with + * xprt_update_rtt(), which needs to know that it is recording a + * reply to the first transmission. + */ + req->rq_ntrans++; connect_cookie = xprt->connect_cookie; - status = xprt->ops->send_request(task); + status = xprt->ops->send_request(req); trace_xprt_transmit(xprt, req->rq_xid, status); if (status != 0) { - task->tk_status = status; - return; + req->rq_ntrans--; + return status; } + + if (is_retrans) + task->tk_client->cl_stats->rpcretrans++; + xprt_inject_disconnect(xprt); dprintk("RPC: %5u xmit complete\n", task->tk_pid); task->tk_flags |= RPC_TASK_SENT; spin_lock_bh(&xprt->transport_lock); - xprt->ops->set_retrans_timeout(task); - xprt->stat.sends++; xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; xprt->stat.bklog_u += xprt->backlog.qlen; @@ -1055,25 +1378,49 @@ void xprt_transmit(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); req->rq_connect_cookie = connect_cookie; - if (rpc_reply_expected(task) && !READ_ONCE(req->rq_reply_bytes_recvd)) { - /* - * Sleep on the pending queue if we're expecting a reply. - * The spinlock ensures atomicity between the test of - * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). - */ - spin_lock(&xprt->recv_lock); - if (!req->rq_reply_bytes_recvd) { - rpc_sleep_on(&xprt->pending, task, xprt_timer); - /* - * Send an extra queue wakeup call if the - * connection was dropped in case the call to - * rpc_sleep_on() raced. - */ - if (!xprt_connected(xprt)) - xprt_wake_pending_tasks(xprt, -ENOTCONN); - } - spin_unlock(&xprt->recv_lock); +out_dequeue: + xprt_request_dequeue_transmit(task); + rpc_wake_up_queued_task_set_status(&xprt->sending, task, status); + return status; +} + +/** + * xprt_transmit - send an RPC request on a transport + * @task: controlling RPC task + * + * Attempts to drain the transmit queue. On exit, either the transport + * signalled an error that needs to be handled before transmission can + * resume, or @task finished transmitting, and detected that it already + * received a reply. + */ +void +xprt_transmit(struct rpc_task *task) +{ + struct rpc_rqst *next, *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int status; + + spin_lock(&xprt->queue_lock); + while (!list_empty(&xprt->xmit_queue)) { + next = list_first_entry(&xprt->xmit_queue, + struct rpc_rqst, rq_xmit); + xprt_pin_rqst(next); + spin_unlock(&xprt->queue_lock); + status = xprt_request_transmit(next, task); + if (status == -EBADMSG && next != req) + status = 0; + cond_resched(); + spin_lock(&xprt->queue_lock); + xprt_unpin_rqst(next); + if (status == 0) { + if (!xprt_request_data_received(task) || + test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + continue; + } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + task->tk_status = status; + break; } + spin_unlock(&xprt->queue_lock); } static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) @@ -1170,20 +1517,6 @@ out_init_req: } EXPORT_SYMBOL_GPL(xprt_alloc_slot); -void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) -{ - /* Note: grabbing the xprt_lock_write() ensures that we throttle - * new slot allocation if the transport is congested (i.e. when - * reconnecting a stream transport or when out of socket write - * buffer space). - */ - if (xprt_lock_write(xprt, task)) { - xprt_alloc_slot(xprt, task); - xprt_release_write(xprt, task); - } -} -EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot); - void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { spin_lock(&xprt->reserve_lock); @@ -1250,6 +1583,60 @@ void xprt_free(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_free); +static void +xprt_init_connect_cookie(struct rpc_rqst *req, struct rpc_xprt *xprt) +{ + req->rq_connect_cookie = xprt_connect_cookie(xprt) - 1; +} + +static __be32 +xprt_alloc_xid(struct rpc_xprt *xprt) +{ + __be32 xid; + + spin_lock(&xprt->reserve_lock); + xid = (__force __be32)xprt->xid++; + spin_unlock(&xprt->reserve_lock); + return xid; +} + +static void +xprt_init_xid(struct rpc_xprt *xprt) +{ + xprt->xid = prandom_u32(); +} + +static void +xprt_request_init(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req = task->tk_rqstp; + + req->rq_timeout = task->tk_client->cl_timeout->to_initval; + req->rq_task = task; + req->rq_xprt = xprt; + req->rq_buffer = NULL; + req->rq_xid = xprt_alloc_xid(xprt); + xprt_init_connect_cookie(req, xprt); + req->rq_bytes_sent = 0; + req->rq_snd_buf.len = 0; + req->rq_snd_buf.buflen = 0; + req->rq_rcv_buf.len = 0; + req->rq_rcv_buf.buflen = 0; + req->rq_release_snd_buf = NULL; + xprt_reset_majortimeo(req); + dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, + req, ntohl(req->rq_xid)); +} + +static void +xprt_do_reserve(struct rpc_xprt *xprt, struct rpc_task *task) +{ + xprt->ops->alloc_slot(xprt, task); + if (task->tk_rqstp != NULL) + xprt_request_init(task); +} + /** * xprt_reserve - allocate an RPC request slot * @task: RPC task requesting a slot allocation @@ -1269,7 +1656,7 @@ void xprt_reserve(struct rpc_task *task) task->tk_timeout = 0; task->tk_status = -EAGAIN; if (!xprt_throttle_congested(xprt, task)) - xprt->ops->alloc_slot(xprt, task); + xprt_do_reserve(xprt, task); } /** @@ -1291,45 +1678,29 @@ void xprt_retry_reserve(struct rpc_task *task) task->tk_timeout = 0; task->tk_status = -EAGAIN; - xprt->ops->alloc_slot(xprt, task); -} - -static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) -{ - __be32 xid; - - spin_lock(&xprt->reserve_lock); - xid = (__force __be32)xprt->xid++; - spin_unlock(&xprt->reserve_lock); - return xid; + xprt_do_reserve(xprt, task); } -static inline void xprt_init_xid(struct rpc_xprt *xprt) -{ - xprt->xid = prandom_u32(); -} - -void xprt_request_init(struct rpc_task *task) +static void +xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req) { - struct rpc_xprt *xprt = task->tk_xprt; - struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; - INIT_LIST_HEAD(&req->rq_list); - req->rq_timeout = task->tk_client->cl_timeout->to_initval; - req->rq_task = task; - req->rq_xprt = xprt; - req->rq_buffer = NULL; - req->rq_xid = xprt_alloc_xid(xprt); - req->rq_connect_cookie = xprt->connect_cookie - 1; - req->rq_bytes_sent = 0; - req->rq_snd_buf.len = 0; - req->rq_snd_buf.buflen = 0; - req->rq_rcv_buf.len = 0; - req->rq_rcv_buf.buflen = 0; - req->rq_release_snd_buf = NULL; - xprt_reset_majortimeo(req); - dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, - req, ntohl(req->rq_xid)); + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || + test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || + xprt_is_pinned_rqst(req)) { + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + xprt_request_dequeue_receive_locked(task); + while (xprt_is_pinned_rqst(req)) { + set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + xprt_wait_on_pinned_rqst(req); + spin_lock(&xprt->queue_lock); + clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + } + spin_unlock(&xprt->queue_lock); + } } /** @@ -1345,8 +1716,7 @@ void xprt_release(struct rpc_task *task) if (req == NULL) { if (task->tk_client) { xprt = task->tk_xprt; - if (xprt->snd_task == task) - xprt_release_write(xprt, task); + xprt_release_write(xprt, task); } return; } @@ -1356,12 +1726,7 @@ void xprt_release(struct rpc_task *task) task->tk_ops->rpc_count_stats(task, task->tk_calldata); else if (task->tk_client) rpc_count_iostats(task, task->tk_client->cl_metrics); - spin_lock(&xprt->recv_lock); - if (!list_empty(&req->rq_list)) { - list_del_init(&req->rq_list); - xprt_wait_on_pinned_rqst(req); - } - spin_unlock(&xprt->recv_lock); + xprt_request_dequeue_all(task, req); spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) @@ -1372,6 +1737,7 @@ void xprt_release(struct rpc_task *task) if (req->rq_buffer) xprt->ops->buf_free(task); xprt_inject_disconnect(xprt); + xdr_free_bvec(&req->rq_rcv_buf); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); task->tk_rqstp = NULL; @@ -1385,16 +1751,36 @@ void xprt_release(struct rpc_task *task) xprt_free_bc_request(req); } +#ifdef CONFIG_SUNRPC_BACKCHANNEL +void +xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task) +{ + struct xdr_buf *xbufp = &req->rq_snd_buf; + + task->tk_rqstp = req; + req->rq_task = task; + xprt_init_connect_cookie(req, req->rq_xprt); + /* + * Set up the xdr_buf length. + * This also indicates that the buffer is XDR encoded already. + */ + xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + + xbufp->tail[0].iov_len; + req->rq_bytes_sent = 0; +} +#endif + static void xprt_init(struct rpc_xprt *xprt, struct net *net) { kref_init(&xprt->kref); spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); - spin_lock_init(&xprt->recv_lock); + spin_lock_init(&xprt->queue_lock); INIT_LIST_HEAD(&xprt->free); - INIT_LIST_HEAD(&xprt->recv); + xprt->recv_queue = RB_ROOT; + INIT_LIST_HEAD(&xprt->xmit_queue); #if defined(CONFIG_SUNRPC_BACKCHANNEL) spin_lock_init(&xprt->bc_pa_lock); INIT_LIST_HEAD(&xprt->bc_pa_list); @@ -1407,7 +1793,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) rpc_init_wait_queue(&xprt->binding, "xprt_binding"); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); - rpc_init_priority_wait_queue(&xprt->sending, "xprt_sending"); + rpc_init_wait_queue(&xprt->sending, "xprt_sending"); rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); xprt_init_xid(xprt); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c @@ -51,12 +51,11 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, rqst = &req->rl_slot; rqst->rq_xprt = xprt; - INIT_LIST_HEAD(&rqst->rq_list); INIT_LIST_HEAD(&rqst->rq_bc_list); __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); @@ -201,6 +200,9 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (!xprt_connected(rqst->rq_xprt)) goto drop_connection; + if (!xprt_request_get_cong(rqst->rq_xprt, rqst)) + return -EBADSLT; + rc = rpcrdma_bc_marshal_reply(rqst); if (rc < 0) goto failed_marshal; @@ -228,16 +230,16 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpc_rqst *rqst, *tmp; - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { list_del(&rqst->rq_bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); rpcrdma_bc_free_rqst(r_xprt, rqst); - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); } - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); } /** @@ -255,9 +257,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) rpcrdma_recv_buffer_put(req->rl_reply); req->rl_reply = NULL; - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); } /** diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c @@ -49,46 +49,7 @@ fmr_is_supported(struct rpcrdma_ia *ia) return true; } -static int -fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - static struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_FMR_SGES, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - - mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(u64), GFP_KERNEL); - if (!mr->fmr.fm_physaddrs) - goto out_free; - - mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(*mr->mr_sg), GFP_KERNEL); - if (!mr->mr_sg) - goto out_free; - - sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); - - mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, - &fmr_attr); - if (IS_ERR(mr->fmr.fm_mr)) - goto out_fmr_err; - - INIT_LIST_HEAD(&mr->mr_list); - return 0; - -out_fmr_err: - dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, - PTR_ERR(mr->fmr.fm_mr)); - -out_free: - kfree(mr->mr_sg); - kfree(mr->fmr.fm_physaddrs); - return -ENOMEM; -} - -static int +static void __fmr_unmap(struct rpcrdma_mr *mr) { LIST_HEAD(l); @@ -97,13 +58,16 @@ __fmr_unmap(struct rpcrdma_mr *mr) list_add(&mr->fmr.fm_mr->list, &l); rc = ib_unmap_fmr(&l); list_del(&mr->fmr.fm_mr->list); - return rc; + if (rc) + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", + mr, rc); } +/* Release an MR. + */ static void fmr_op_release_mr(struct rpcrdma_mr *mr) { - LIST_HEAD(unmap_list); int rc; kfree(mr->fmr.fm_physaddrs); @@ -112,10 +76,7 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) /* In case this one was left mapped, try to unmap it * to prevent dealloc_fmr from failing with EBUSY */ - rc = __fmr_unmap(mr); - if (rc) - pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", - mr, rc); + __fmr_unmap(mr); rc = ib_dealloc_fmr(mr->fmr.fm_mr); if (rc) @@ -125,40 +86,68 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -/* Reset of a single FMR. +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. */ static void -fmr_op_recover_mr(struct rpcrdma_mr *mr) +fmr_mr_recycle_worker(struct work_struct *work) { + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - int rc; - /* ORDER: invalidate first */ - rc = __fmr_unmap(mr); - if (rc) - goto out_release; - - /* ORDER: then DMA unmap */ - rpcrdma_mr_unmap_and_put(mr); + trace_xprtrdma_mr_recycle(mr); - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; - - trace_xprtrdma_dma_unmap(mr); + trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); spin_lock(&r_xprt->rx_buf.rb_mrlock); list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; spin_unlock(&r_xprt->rx_buf.rb_mrlock); - fmr_op_release_mr(mr); } +static int +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + static struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + + mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(u64), GFP_KERNEL); + if (!mr->fmr.fm_physaddrs) + goto out_free; + + mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) + goto out_free; + + sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); + + mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + &fmr_attr); + if (IS_ERR(mr->fmr.fm_mr)) + goto out_fmr_err; + + INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, fmr_mr_recycle_worker); + return 0; + +out_fmr_err: + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, + PTR_ERR(mr->fmr.fm_mr)); + +out_free: + kfree(mr->mr_sg); + kfree(mr->fmr.fm_physaddrs); + return -ENOMEM; +} + /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -187,6 +176,7 @@ fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES); + ia->ri_max_segs += 2; /* segments for head and tail buffers */ return 0; } @@ -244,7 +234,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_dma_map(mr); + trace_xprtrdma_mr_map(mr); for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); @@ -305,13 +295,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) { dprintk("RPC: %s: unmapping fmr %p\n", __func__, &mr->fmr); - trace_xprtrdma_localinv(mr); + trace_xprtrdma_mr_localinv(mr); list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); } r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); if (rc) - goto out_reset; + goto out_release; /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. @@ -324,13 +314,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) return; -out_reset: +out_release: pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); while (!list_empty(mrs)) { mr = rpcrdma_mr_pop(mrs); list_del(&mr->fmr.fm_mr->list); - fmr_op_recover_mr(mr); + rpcrdma_mr_recycle(mr); } } @@ -338,7 +328,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_send = fmr_op_send, .ro_unmap_sync = fmr_op_unmap_sync, - .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, .ro_init_mr = fmr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c @@ -97,6 +97,44 @@ out_not_supported: return false; } +static void +frwr_op_release_mr(struct rpcrdma_mr *mr) +{ + int rc; + + rc = ib_dereg_mr(mr->frwr.fr_mr); + if (rc) + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", + mr, rc); + kfree(mr->mr_sg); + kfree(mr); +} + +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. + */ +static void +frwr_mr_recycle_worker(struct work_struct *work) +{ + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); + enum rpcrdma_frwr_state state = mr->frwr.fr_state; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + + trace_xprtrdma_mr_recycle(mr); + + if (state != FRWR_FLUSHED_LI) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + } + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; + spin_unlock(&r_xprt->rx_buf.rb_mrlock); + frwr_op_release_mr(mr); +} + static int frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { @@ -113,6 +151,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) goto out_list_err; INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker); sg_init_table(mr->mr_sg, depth); init_completion(&frwr->fr_linv_done); return 0; @@ -131,79 +170,6 @@ out_list_err: return rc; } -static void -frwr_op_release_mr(struct rpcrdma_mr *mr) -{ - int rc; - - rc = ib_dereg_mr(mr->frwr.fr_mr); - if (rc) - pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", - mr, rc); - kfree(mr->mr_sg); - kfree(mr); -} - -static int -__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - struct rpcrdma_frwr *frwr = &mr->frwr; - int rc; - - rc = ib_dereg_mr(frwr->fr_mr); - if (rc) { - pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", - rc, mr); - return rc; - } - - frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, - ia->ri_max_frwr_depth); - if (IS_ERR(frwr->fr_mr)) { - pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", - PTR_ERR(frwr->fr_mr), mr); - return PTR_ERR(frwr->fr_mr); - } - - dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); - frwr->fr_state = FRWR_IS_INVALID; - return 0; -} - -/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. - */ -static void -frwr_op_recover_mr(struct rpcrdma_mr *mr) -{ - enum rpcrdma_frwr_state state = mr->frwr.fr_state; - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc; - - rc = __frwr_mr_reset(ia, mr); - if (state != FRWR_FLUSHED_LI) { - trace_xprtrdma_dma_unmap(mr); - ib_dma_unmap_sg(ia->ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - } - if (rc) - goto out_release; - - rpcrdma_mr_put(mr); - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; - - spin_lock(&r_xprt->rx_buf.rb_mrlock); - list_del(&mr->mr_all); - spin_unlock(&r_xprt->rx_buf.rb_mrlock); - - frwr_op_release_mr(mr); -} - /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -276,6 +242,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / ia->ri_max_frwr_depth); + ia->ri_max_segs += 2; /* segments for head and tail buffers */ return 0; } @@ -384,7 +351,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr = NULL; do { if (mr) - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); mr = rpcrdma_mr_get(r_xprt); if (!mr) return ERR_PTR(-EAGAIN); @@ -417,7 +384,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_dma_map(mr); + trace_xprtrdma_mr_map(mr); ibmr = frwr->fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); @@ -451,7 +418,7 @@ out_dmamap_err: out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", frwr->fr_mr, n, mr->mr_nents); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } @@ -499,7 +466,7 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_remoteinv(mr); + trace_xprtrdma_mr_remoteinv(mr); mr->frwr.fr_state = FRWR_IS_INVALID; rpcrdma_mr_unmap_and_put(mr); break; /* only one invalidated MR per RPC */ @@ -536,7 +503,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) mr->frwr.fr_state = FRWR_IS_INVALID; frwr = &mr->frwr; - trace_xprtrdma_localinv(mr); + trace_xprtrdma_mr_localinv(mr); frwr->fr_cqe.done = frwr_wc_localinv; last = &frwr->fr_invwr; @@ -570,7 +537,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) if (bad_wr != first) wait_for_completion(&frwr->fr_linv_done); if (rc) - goto reset_mrs; + goto out_release; /* ORDER: Now DMA unmap all of the MRs, and return * them to the free MR list. @@ -582,22 +549,21 @@ unmap: } return; -reset_mrs: +out_release: pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); - /* Find and reset the MRs in the LOCAL_INV WRs that did not + /* Unmap and release the MRs in the LOCAL_INV WRs that did not * get posted. */ while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); - - __frwr_mr_reset(ia, mr); - bad_wr = bad_wr->next; + + list_del(&mr->mr_list); + frwr_op_release_mr(mr); } - goto unmap; } const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { @@ -605,7 +571,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_send = frwr_op_send, .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, - .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, .ro_init_mr = frwr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -71,7 +71,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Read list size */ - maxsegs += 2; /* segment for head and tail buffers */ size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ @@ -97,7 +96,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Write list size */ - maxsegs += 2; /* segment for head and tail buffers */ size = sizeof(__be32); /* segment count */ size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ @@ -805,7 +803,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) struct rpcrdma_mr *mr; mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); } /* This implementation supports the following combinations @@ -866,7 +864,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) out_err: switch (ret) { case -EAGAIN: - xprt_wait_for_buffer_space(rqst->rq_task, NULL); + xprt_wait_for_buffer_space(rqst->rq_xprt); break; case -ENOBUFS: break; @@ -1216,7 +1214,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rpc_rqst *rqst = rep->rr_rqst; - unsigned long cwnd; int status; xprt->reestablish_timeout = 0; @@ -1238,15 +1235,10 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) goto out_badheader; out: - spin_lock(&xprt->recv_lock); - cwnd = xprt->cwnd; - xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT; - if (xprt->cwnd > cwnd) - xprt_release_rqst_cong(rqst->rq_task); - + spin_lock(&xprt->queue_lock); xprt_complete_rqst(rqst->rq_task, status); xprt_unpin_rqst(rqst); - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); return; /* If the incoming reply terminated a pending RPC, the next @@ -1345,19 +1337,23 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); rqst = xprt_lookup_rqst(xprt, rep->rr_xid); if (!rqst) goto out_norqst; xprt_pin_rqst(rqst); + spin_unlock(&xprt->queue_lock); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > buf->rb_max_requests) credits = buf->rb_max_requests; - buf->rb_credits = credits; - - spin_unlock(&xprt->recv_lock); + if (buf->rb_credits != credits) { + spin_lock_bh(&xprt->transport_lock); + buf->rb_credits = credits; + xprt->cwnd = credits << RPC_CWNDSHIFT; + spin_unlock_bh(&xprt->transport_lock); + } req = rpcr_to_rdmar(rqst); req->rl_reply = rep; @@ -1378,7 +1374,7 @@ out_badversion: * is corrupt. */ out_norqst: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); trace_xprtrdma_reply_rqst(rep); goto repost; diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -56,7 +56,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, if (src->iov_len < 24) goto out_shortreply; - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); req = xprt_lookup_rqst(xprt, xid); if (!req) goto out_notfound; @@ -86,7 +86,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, rcvbuf->len = 0; out_unlock: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); out: return ret; @@ -215,9 +215,8 @@ drop_connection: * connection. */ static int -xprt_rdma_bc_send_request(struct rpc_task *task) +xprt_rdma_bc_send_request(struct rpc_rqst *rqst) { - struct rpc_rqst *rqst = task->tk_rqstp; struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; struct svcxprt_rdma *rdma; int ret; @@ -225,12 +224,7 @@ xprt_rdma_bc_send_request(struct rpc_task *task) dprintk("svcrdma: sending bc call with xid: %08x\n", be32_to_cpu(rqst->rq_xid)); - if (!mutex_trylock(&sxprt->xpt_mutex)) { - rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); - if (!mutex_trylock(&sxprt->xpt_mutex)) - return -EAGAIN; - rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); - } + mutex_lock(&sxprt->xpt_mutex); ret = -ENOTCONN; rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); @@ -248,6 +242,7 @@ static void xprt_rdma_bc_close(struct rpc_xprt *xprt) { dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + xprt->cwnd = RPC_CWNDSHIFT; } static void diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c @@ -225,69 +225,59 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt) } } -void -rpcrdma_conn_func(struct rpcrdma_ep *ep) -{ - schedule_delayed_work(&ep->rep_connect_worker, 0); -} - -void -rpcrdma_connect_worker(struct work_struct *work) -{ - struct rpcrdma_ep *ep = - container_of(work, struct rpcrdma_ep, rep_connect_worker.work); - struct rpcrdma_xprt *r_xprt = - container_of(ep, struct rpcrdma_xprt, rx_ep); - struct rpc_xprt *xprt = &r_xprt->rx_xprt; - - spin_lock_bh(&xprt->transport_lock); - if (ep->rep_connected > 0) { - if (!xprt_test_and_set_connected(xprt)) - xprt_wake_pending_tasks(xprt, 0); - } else { - if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, -ENOTCONN); - } - spin_unlock_bh(&xprt->transport_lock); -} - +/** + * xprt_rdma_connect_worker - establish connection in the background + * @work: worker thread context + * + * Requester holds the xprt's send lock to prevent activity on this + * transport while a fresh connection is being established. RPC tasks + * sleep on the xprt's pending queue waiting for connect to complete. + */ static void xprt_rdma_connect_worker(struct work_struct *work) { struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; - int rc = 0; - - xprt_clear_connected(xprt); + int rc; rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - xprt_wake_pending_tasks(xprt, rc); - xprt_clear_connecting(xprt); + if (r_xprt->rx_ep.rep_connected > 0) { + if (!xprt_test_and_set_connected(xprt)) { + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; + xprt_wake_pending_tasks(xprt, -EAGAIN); + } + } else { + if (xprt_test_and_clear_connected(xprt)) + xprt_wake_pending_tasks(xprt, rc); + } } +/** + * xprt_rdma_inject_disconnect - inject a connection fault + * @xprt: transport context + * + * If @xprt is connected, disconnect it to simulate spurious connection + * loss. + */ static void xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) { - struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, - rx_xprt); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_inject_dsc(r_xprt); rdma_disconnect(r_xprt->rx_ia.ri_id); } -/* - * xprt_rdma_destroy +/** + * xprt_rdma_destroy - Full tear down of transport + * @xprt: doomed transport context * - * Destroy the xprt. - * Free all memory associated with the object, including its own. - * NOTE: none of the *destroy methods free memory for their top-level - * objects, even though they may have allocated it (they do free - * private memory). It's up to the caller to handle it. In this - * case (RDMA transport), all structure memory is inlined with the - * struct rpcrdma_xprt. + * Caller guarantees there will be no more calls to us with + * this @xprt. */ static void xprt_rdma_destroy(struct rpc_xprt *xprt) @@ -298,8 +288,6 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - xprt_clear_connected(xprt); - rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); @@ -442,11 +430,12 @@ out1: } /** - * xprt_rdma_close - Close down RDMA connection - * @xprt: generic transport to be closed + * xprt_rdma_close - close a transport connection + * @xprt: transport context * - * Called during transport shutdown reconnect, or device - * removal. Caller holds the transport's write lock. + * Called during transport shutdown, reconnect, or device removal. + * Caller holds @xprt's send lock to prevent activity on this + * transport while the connection is torn down. */ static void xprt_rdma_close(struct rpc_xprt *xprt) @@ -468,6 +457,12 @@ xprt_rdma_close(struct rpc_xprt *xprt) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); rpcrdma_ep_disconnect(ep, ia); + + /* Prepare @xprt for the next connection by reinitializing + * its credit grant to one (see RFC 8166, Section 3.3.3). + */ + r_xprt->rx_buf.rb_credits = 1; + xprt->cwnd = RPC_CWNDSHIFT; } /** @@ -519,6 +514,12 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) xprt_force_disconnect(xprt); } +/** + * xprt_rdma_connect - try to establish a transport connection + * @xprt: transport state + * @task: RPC scheduler context + * + */ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -638,13 +639,6 @@ rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * 0: Success; rq_buffer points to RPC buffer to use * ENOMEM: Out of memory, call again later * EIO: A permanent error occurred, do not retry - * - * The RDMA allocate/free functions need the task structure as a place - * to hide the struct rpcrdma_req, which is necessary for the actual - * send/recv sequence. - * - * xprt_rdma_allocate provides buffers that are already mapped for - * DMA, and a local DMA lkey is provided for each. */ static int xprt_rdma_allocate(struct rpc_task *task) @@ -693,7 +687,7 @@ xprt_rdma_free(struct rpc_task *task) /** * xprt_rdma_send_request - marshal and send an RPC request - * @task: RPC task with an RPC message in rq_snd_buf + * @rqst: RPC message in rq_snd_buf * * Caller holds the transport's write lock. * @@ -706,9 +700,8 @@ xprt_rdma_free(struct rpc_task *task) * sent. Do not try to send this message again. */ static int -xprt_rdma_send_request(struct rpc_task *task) +xprt_rdma_send_request(struct rpc_rqst *rqst) { - struct rpc_rqst *rqst = task->tk_rqstp; struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -722,6 +715,9 @@ xprt_rdma_send_request(struct rpc_task *task) if (!xprt_connected(xprt)) goto drop_connection; + if (!xprt_request_get_cong(xprt, rqst)) + return -EBADSLT; + rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) goto failed_marshal; @@ -741,7 +737,7 @@ xprt_rdma_send_request(struct rpc_task *task) /* An RPC with no reply will throw off credit accounting, * so drop the connection to reset the credit grant. */ - if (!rpc_reply_expected(task)) + if (!rpc_reply_expected(rqst->rq_task)) goto drop_connection; return 0; @@ -766,7 +762,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) 0, /* need a local port? */ xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -786,7 +782,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", - r_xprt->rx_stats.mrs_recovered, + r_xprt->rx_stats.mrs_recycled, r_xprt->rx_stats.mrs_orphaned, r_xprt->rx_stats.mrs_allocated, r_xprt->rx_stats.local_inv_needed, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c @@ -108,20 +108,48 @@ rpcrdma_destroy_wq(void) } } +/** + * rpcrdma_disconnect_worker - Force a disconnect + * @work: endpoint to be disconnected + * + * Provider callbacks can possibly run in an IRQ context. This function + * is invoked in a worker thread to guarantee that disconnect wake-up + * calls are always done in process context. + */ +static void +rpcrdma_disconnect_worker(struct work_struct *work) +{ + struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, + rep_disconnect_worker.work); + struct rpcrdma_xprt *r_xprt = + container_of(ep, struct rpcrdma_xprt, rx_ep); + + xprt_force_disconnect(&r_xprt->rx_xprt); +} + +/** + * rpcrdma_qp_event_handler - Handle one QP event (error notification) + * @event: details of the event + * @context: ep that owns QP where event occurred + * + * Called from the RDMA provider (device driver) possibly in an interrupt + * context. + */ static void -rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) +rpcrdma_qp_event_handler(struct ib_event *event, void *context) { struct rpcrdma_ep *ep = context; struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, rx_ep); - trace_xprtrdma_qp_error(r_xprt, event); - pr_err("rpcrdma: %s on device %s ep %p\n", - ib_event_msg(event->event), event->device->name, context); + trace_xprtrdma_qp_event(r_xprt, event); + pr_err("rpcrdma: %s on device %s connected to %s:%s\n", + ib_event_msg(event->event), event->device->name, + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); if (ep->rep_connected == 1) { ep->rep_connected = -EIO; - rpcrdma_conn_func(ep); + schedule_delayed_work(&ep->rep_disconnect_worker, 0); wake_up_all(&ep->rep_connect_wait); } } @@ -219,38 +247,48 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, rpcrdma_set_max_header_sizes(r_xprt); } +/** + * rpcrdma_cm_event_handler - Handle RDMA CM events + * @id: rdma_cm_id on which an event has occurred + * @event: details of the event + * + * Called with @id's mutex held. Returns 1 if caller should + * destroy @id, otherwise 0. + */ static int -rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) +rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct rpcrdma_xprt *xprt = id->context; - struct rpcrdma_ia *ia = &xprt->rx_ia; - struct rpcrdma_ep *ep = &xprt->rx_ep; - int connstate = 0; + struct rpcrdma_xprt *r_xprt = id->context; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + might_sleep(); - trace_xprtrdma_conn_upcall(xprt, event); + trace_xprtrdma_cm_event(r_xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: ia->ri_async_rc = 0; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_ADDR_ERROR: ia->ri_async_rc = -EPROTO; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_ROUTE_ERROR: ia->ri_async_rc = -ENETUNREACH; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) pr_info("rpcrdma: removing device %s for %s:%s\n", ia->ri_device->name, - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt)); + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); ep->rep_connected = -ENODEV; - xprt_force_disconnect(&xprt->rx_xprt); + xprt_force_disconnect(xprt); wait_for_completion(&ia->ri_remove_done); ia->ri_id = NULL; @@ -258,41 +296,40 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: - ++xprt->rx_xprt.connect_cookie; - connstate = 1; - rpcrdma_update_connect_private(xprt, &event->param.conn); - goto connected; + ++xprt->connect_cookie; + ep->rep_connected = 1; + rpcrdma_update_connect_private(r_xprt, &event->param.conn); + wake_up_all(&ep->rep_connect_wait); + break; case RDMA_CM_EVENT_CONNECT_ERROR: - connstate = -ENOTCONN; - goto connected; + ep->rep_connected = -ENOTCONN; + goto disconnected; case RDMA_CM_EVENT_UNREACHABLE: - connstate = -ENETUNREACH; - goto connected; + ep->rep_connected = -ENETUNREACH; + goto disconnected; case RDMA_CM_EVENT_REJECTED: dprintk("rpcrdma: connection to %s:%s rejected: %s\n", - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), rdma_reject_msg(id, event->status)); - connstate = -ECONNREFUSED; + ep->rep_connected = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) - connstate = -EAGAIN; - goto connected; + ep->rep_connected = -EAGAIN; + goto disconnected; case RDMA_CM_EVENT_DISCONNECTED: - ++xprt->rx_xprt.connect_cookie; - connstate = -ECONNABORTED; -connected: - ep->rep_connected = connstate; - rpcrdma_conn_func(ep); + ++xprt->connect_cookie; + ep->rep_connected = -ECONNABORTED; +disconnected: + xprt_force_disconnect(xprt); wake_up_all(&ep->rep_connect_wait); - /*FALLTHROUGH*/ + break; default: - dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n", - __func__, - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), - ia->ri_device->name, ia->ri_ops->ro_displayname, - ep, rdma_event_msg(event->event)); break; } + dprintk("RPC: %s: %s:%s on %s/%s: %s\n", __func__, + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), + ia->ri_device->name, ia->ri_ops->ro_displayname, + rdma_event_msg(event->event)); return 0; } @@ -308,7 +345,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) init_completion(&ia->ri_done); init_completion(&ia->ri_remove_done); - id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_conn_upcall, + id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); @@ -519,7 +556,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, if (rc) return rc; - ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; + ep->rep_attr.event_handler = rpcrdma_qp_event_handler; ep->rep_attr.qp_context = ep; ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_sge = max_sge; @@ -542,7 +579,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, cdata->max_requests >> 2); ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); - INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + INIT_DELAYED_WORK(&ep->rep_disconnect_worker, + rpcrdma_disconnect_worker); sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, @@ -615,7 +653,7 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - cancel_delayed_work_sync(&ep->rep_connect_worker); + cancel_delayed_work_sync(&ep->rep_disconnect_worker); if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); @@ -728,6 +766,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; int rc; retry: @@ -754,6 +793,8 @@ retry: } ep->rep_connected = 0; + xprt_clear_connected(xprt); + rpcrdma_post_recvs(r_xprt, true); rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -877,7 +918,6 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) sc->sc_xprt = r_xprt; buf->rb_sc_ctxs[i] = sc; } - buf->rb_flags = 0; return 0; @@ -978,39 +1018,6 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) } static void -rpcrdma_mr_recovery_worker(struct work_struct *work) -{ - struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, - rb_recovery_worker.work); - struct rpcrdma_mr *mr; - - spin_lock(&buf->rb_recovery_lock); - while (!list_empty(&buf->rb_stale_mrs)) { - mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - trace_xprtrdma_recover_mr(mr); - mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); - - spin_lock(&buf->rb_recovery_lock); - } - spin_unlock(&buf->rb_recovery_lock); -} - -void -rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - - spin_lock(&buf->rb_recovery_lock); - rpcrdma_mr_push(mr, &buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - schedule_delayed_work(&buf->rb_recovery_worker, 0); -} - -static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; @@ -1019,7 +1026,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) LIST_HEAD(free); LIST_HEAD(all); - for (count = 0; count < 3; count++) { + for (count = 0; count < ia->ri_max_segs; count++) { struct rpcrdma_mr *mr; int rc; @@ -1138,18 +1145,15 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; int i, rc; + buf->rb_flags = 0; buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); - spin_lock_init(&buf->rb_recovery_lock); INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all); - INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); - INIT_DELAYED_WORK(&buf->rb_recovery_worker, - rpcrdma_mr_recovery_worker); rpcrdma_mrs_create(r_xprt); @@ -1233,7 +1237,6 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - cancel_delayed_work_sync(&buf->rb_recovery_worker); cancel_delayed_work_sync(&buf->rb_refresh_worker); rpcrdma_sendctxs_destroy(buf); @@ -1326,7 +1329,7 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - trace_xprtrdma_dma_unmap(mr); + trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); __rpcrdma_mr_put(&r_xprt->rx_buf, mr); @@ -1518,9 +1521,11 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) struct ib_recv_wr *wr, *bad_wr; int needed, count, rc; + rc = 0; + count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); if (buf->rb_posted_receives > needed) - return; + goto out; needed -= buf->rb_posted_receives; count = 0; @@ -1556,7 +1561,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) --needed; } if (!count) - return; + goto out; rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); @@ -1570,5 +1575,6 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) } } buf->rb_posted_receives += count; +out: trace_xprtrdma_post_recvs(r_xprt, count, rc); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -101,7 +101,7 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - struct delayed_work rep_connect_worker; + struct delayed_work rep_disconnect_worker; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -280,6 +280,7 @@ struct rpcrdma_mr { u32 mr_handle; u32 mr_length; u64 mr_offset; + struct work_struct mr_recycle; struct list_head mr_all; }; @@ -411,9 +412,6 @@ struct rpcrdma_buffer { u32 rb_bc_max_requests; - spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ - struct list_head rb_stale_mrs; - struct delayed_work rb_recovery_worker; struct delayed_work rb_refresh_worker; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -452,7 +450,7 @@ struct rpcrdma_stats { unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; - unsigned long mrs_recovered; + unsigned long mrs_recycled; unsigned long mrs_orphaned; unsigned long mrs_allocated; unsigned long empty_sendctx_q; @@ -481,7 +479,6 @@ struct rpcrdma_memreg_ops { struct list_head *mrs); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_recover_mr)(struct rpcrdma_mr *mr); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); @@ -559,7 +556,6 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -void rpcrdma_conn_func(struct rpcrdma_ep *ep); void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, @@ -578,7 +574,12 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); -void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); + +static inline void +rpcrdma_mr_recycle(struct rpcrdma_mr *mr) +{ + schedule_work(&mr->mr_recycle); +} struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); @@ -652,7 +653,6 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) extern unsigned int xprt_rdma_max_inline_read; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); -void rpcrdma_connect_worker(struct work_struct *work); void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c @@ -47,13 +47,13 @@ #include <net/checksum.h> #include <net/udp.h> #include <net/tcp.h> +#include <linux/bvec.h> +#include <linux/uio.h> #include <trace/events/sunrpc.h> #include "sunrpc.h" -#define RPC_TCP_READ_CHUNK_SZ (3*512*1024) - static void xs_close(struct rpc_xprt *xprt); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); @@ -129,7 +129,7 @@ static struct ctl_table xs_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &xprt_min_resvport_limit, - .extra2 = &xprt_max_resvport + .extra2 = &xprt_max_resvport_limit }, { .procname = "max_resvport", @@ -137,7 +137,7 @@ static struct ctl_table xs_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &xprt_min_resvport, + .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, { @@ -325,6 +325,362 @@ static void xs_free_peer_addresses(struct rpc_xprt *xprt) } } +static size_t +xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) +{ + size_t i,n; + + if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + return want; + if (want > buf->page_len) + want = buf->page_len; + n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < n; i++) { + if (buf->pages[i]) + continue; + buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp); + if (!buf->pages[i]) { + buf->page_len = (i * PAGE_SIZE) - buf->page_base; + return buf->page_len; + } + } + return want; +} + +static ssize_t +xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) +{ + ssize_t ret; + if (seek != 0) + iov_iter_advance(&msg->msg_iter, seek); + ret = sock_recvmsg(sock, msg, flags); + return ret > 0 ? ret + seek : ret; +} + +static ssize_t +xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags, + struct kvec *kvec, size_t count, size_t seek) +{ + iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, kvec, 1, count); + return xs_sock_recvmsg(sock, msg, flags, seek); +} + +static ssize_t +xs_read_bvec(struct socket *sock, struct msghdr *msg, int flags, + struct bio_vec *bvec, unsigned long nr, size_t count, + size_t seek) +{ + iov_iter_bvec(&msg->msg_iter, READ | ITER_BVEC, bvec, nr, count); + return xs_sock_recvmsg(sock, msg, flags, seek); +} + +static ssize_t +xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, + size_t count) +{ + struct kvec kvec = { 0 }; + return xs_read_kvec(sock, msg, flags | MSG_TRUNC, &kvec, count, 0); +} + +static ssize_t +xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, + struct xdr_buf *buf, size_t count, size_t seek, size_t *read) +{ + size_t want, seek_init = seek, offset = 0; + ssize_t ret; + + if (seek < buf->head[0].iov_len) { + want = min_t(size_t, count, buf->head[0].iov_len); + ret = xs_read_kvec(sock, msg, flags, &buf->head[0], want, seek); + if (ret <= 0) + goto sock_err; + offset += ret; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + seek = 0; + } else { + seek -= buf->head[0].iov_len; + offset += buf->head[0].iov_len; + } + if (seek < buf->page_len) { + want = xs_alloc_sparse_pages(buf, + min_t(size_t, count - offset, buf->page_len), + GFP_NOWAIT); + ret = xs_read_bvec(sock, msg, flags, buf->bvec, + xdr_buf_pagecount(buf), + want + buf->page_base, + seek + buf->page_base); + if (ret <= 0) + goto sock_err; + offset += ret - buf->page_base; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + seek = 0; + } else { + seek -= buf->page_len; + offset += buf->page_len; + } + if (seek < buf->tail[0].iov_len) { + want = min_t(size_t, count - offset, buf->tail[0].iov_len); + ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek); + if (ret <= 0) + goto sock_err; + offset += ret; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + } else + offset += buf->tail[0].iov_len; + ret = -EMSGSIZE; + msg->msg_flags |= MSG_TRUNC; +out: + *read = offset - seek_init; + return ret; +eagain: + ret = -EAGAIN; + goto out; +sock_err: + offset += seek; + goto out; +} + +static void +xs_read_header(struct sock_xprt *transport, struct xdr_buf *buf) +{ + if (!transport->recv.copied) { + if (buf->head[0].iov_len >= transport->recv.offset) + memcpy(buf->head[0].iov_base, + &transport->recv.xid, + transport->recv.offset); + transport->recv.copied = transport->recv.offset; + } +} + +static bool +xs_read_stream_request_done(struct sock_xprt *transport) +{ + return transport->recv.fraghdr & cpu_to_be32(RPC_LAST_STREAM_FRAGMENT); +} + +static ssize_t +xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, + int flags, struct rpc_rqst *req) +{ + struct xdr_buf *buf = &req->rq_private_buf; + size_t want, read; + ssize_t ret; + + xs_read_header(transport, buf); + + want = transport->recv.len - transport->recv.offset; + ret = xs_read_xdr_buf(transport->sock, msg, flags, buf, + transport->recv.copied + want, transport->recv.copied, + &read); + transport->recv.offset += read; + transport->recv.copied += read; + if (transport->recv.offset == transport->recv.len) { + if (xs_read_stream_request_done(transport)) + msg->msg_flags |= MSG_EOR; + return transport->recv.copied; + } + + switch (ret) { + case -EMSGSIZE: + return transport->recv.copied; + case 0: + return -ESHUTDOWN; + default: + if (ret < 0) + return ret; + } + return -EAGAIN; +} + +static size_t +xs_read_stream_headersize(bool isfrag) +{ + if (isfrag) + return sizeof(__be32); + return 3 * sizeof(__be32); +} + +static ssize_t +xs_read_stream_header(struct sock_xprt *transport, struct msghdr *msg, + int flags, size_t want, size_t seek) +{ + struct kvec kvec = { + .iov_base = &transport->recv.fraghdr, + .iov_len = want, + }; + return xs_read_kvec(transport->sock, msg, flags, &kvec, want, seek); +} + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +static ssize_t +xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + struct rpc_xprt *xprt = &transport->xprt; + struct rpc_rqst *req; + ssize_t ret; + + /* Look up and lock the request corresponding to the given XID */ + req = xprt_lookup_bc_request(xprt, transport->recv.xid); + if (!req) { + printk(KERN_WARNING "Callback slot table overflowed\n"); + return -ESHUTDOWN; + } + + ret = xs_read_stream_request(transport, msg, flags, req); + if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + xprt_complete_bc_request(req, ret); + + return ret; +} +#else /* CONFIG_SUNRPC_BACKCHANNEL */ +static ssize_t +xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + return -ESHUTDOWN; +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + +static ssize_t +xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + struct rpc_xprt *xprt = &transport->xprt; + struct rpc_rqst *req; + ssize_t ret = 0; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->queue_lock); + req = xprt_lookup_rqst(xprt, transport->recv.xid); + if (!req) { + msg->msg_flags |= MSG_TRUNC; + goto out; + } + xprt_pin_rqst(req); + spin_unlock(&xprt->queue_lock); + + ret = xs_read_stream_request(transport, msg, flags, req); + + spin_lock(&xprt->queue_lock); + if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + xprt_complete_rqst(req->rq_task, ret); + xprt_unpin_rqst(req); +out: + spin_unlock(&xprt->queue_lock); + return ret; +} + +static ssize_t +xs_read_stream(struct sock_xprt *transport, int flags) +{ + struct msghdr msg = { 0 }; + size_t want, read = 0; + ssize_t ret = 0; + + if (transport->recv.len == 0) { + want = xs_read_stream_headersize(transport->recv.copied != 0); + ret = xs_read_stream_header(transport, &msg, flags, want, + transport->recv.offset); + if (ret <= 0) + goto out_err; + transport->recv.offset = ret; + if (ret != want) { + ret = -EAGAIN; + goto out_err; + } + transport->recv.len = be32_to_cpu(transport->recv.fraghdr) & + RPC_FRAGMENT_SIZE_MASK; + transport->recv.offset -= sizeof(transport->recv.fraghdr); + read = ret; + } + + switch (be32_to_cpu(transport->recv.calldir)) { + case RPC_CALL: + ret = xs_read_stream_call(transport, &msg, flags); + break; + case RPC_REPLY: + ret = xs_read_stream_reply(transport, &msg, flags); + } + if (msg.msg_flags & MSG_TRUNC) { + transport->recv.calldir = cpu_to_be32(-1); + transport->recv.copied = -1; + } + if (ret < 0) + goto out_err; + read += ret; + if (transport->recv.offset < transport->recv.len) { + ret = xs_read_discard(transport->sock, &msg, flags, + transport->recv.len - transport->recv.offset); + if (ret <= 0) + goto out_err; + transport->recv.offset += ret; + read += ret; + if (transport->recv.offset != transport->recv.len) + return -EAGAIN; + } + if (xs_read_stream_request_done(transport)) { + trace_xs_stream_read_request(transport); + transport->recv.copied = 0; + } + transport->recv.offset = 0; + transport->recv.len = 0; + return read; +out_err: + switch (ret) { + case 0: + case -ESHUTDOWN: + xprt_force_disconnect(&transport->xprt); + return -ESHUTDOWN; + } + return ret; +} + +static void xs_stream_data_receive(struct sock_xprt *transport) +{ + size_t read = 0; + ssize_t ret = 0; + + mutex_lock(&transport->recv_mutex); + if (transport->sock == NULL) + goto out; + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); + for (;;) { + ret = xs_read_stream(transport, MSG_DONTWAIT); + if (ret <= 0) + break; + read += ret; + cond_resched(); + } +out: + mutex_unlock(&transport->recv_mutex); + trace_xs_stream_read_data(&transport->xprt, ret, read); +} + +static void xs_stream_data_receive_workfn(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, recv_worker); + xs_stream_data_receive(transport); +} + +static void +xs_stream_reset_connect(struct sock_xprt *transport) +{ + transport->recv.offset = 0; + transport->recv.len = 0; + transport->recv.copied = 0; + transport->xmit.offset = 0; + transport->xprt.stat.connect_count++; + transport->xprt.stat.connect_start = jiffies; +} + #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more) @@ -440,28 +796,21 @@ out: return err; } -static void xs_nospace_callback(struct rpc_task *task) -{ - struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); - - transport->inet->sk_write_pending--; -} - /** - * xs_nospace - place task on wait queue if transmit was incomplete - * @task: task to put to sleep + * xs_nospace - handle transmit was incomplete + * @req: pointer to RPC request * */ -static int xs_nospace(struct rpc_task *task) +static int xs_nospace(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct sock *sk = transport->inet; int ret = -EAGAIN; dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", - task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_task->tk_pid, + req->rq_slen - transport->xmit.offset, req->rq_slen); /* Protect against races with write_space */ @@ -471,7 +820,7 @@ static int xs_nospace(struct rpc_task *task) if (xprt_connected(xprt)) { /* wait for more buffer space */ sk->sk_write_pending++; - xprt_wait_for_buffer_space(task, xs_nospace_callback); + xprt_wait_for_buffer_space(xprt); } else ret = -ENOTCONN; @@ -491,6 +840,22 @@ static int xs_nospace(struct rpc_task *task) return ret; } +static void +xs_stream_prepare_request(struct rpc_rqst *req) +{ + req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_NOIO); +} + +/* + * Determine if the previous message in the stream was aborted before it + * could complete transmission. + */ +static bool +xs_send_request_was_aborted(struct sock_xprt *transport, struct rpc_rqst *req) +{ + return transport->xmit.offset != 0 && req->rq_bytes_sent == 0; +} + /* * Construct a stream transport record marker in @buf. */ @@ -503,7 +868,7 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf) /** * xs_local_send_request - write an RPC request to an AF_LOCAL socket - * @task: RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -512,9 +877,8 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf) * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occured, the request was not sent */ -static int xs_local_send_request(struct rpc_task *task) +static int xs_local_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -522,25 +886,34 @@ static int xs_local_send_request(struct rpc_task *task) int status; int sent = 0; + /* Close the stream if the previous transmission was incomplete */ + if (xs_send_request_was_aborted(transport, req)) { + xs_close(xprt); + return -ENOTCONN; + } + xs_encode_stream_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); req->rq_xtime = ktime_get(); - status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent, + status = xs_sendpages(transport->sock, NULL, 0, xdr, + transport->xmit.offset, true, &sent); dprintk("RPC: %s(%u) = %d\n", - __func__, xdr->len - req->rq_bytes_sent, status); + __func__, xdr->len - transport->xmit.offset, status); if (status == -EAGAIN && sock_writeable(transport->inet)) status = -ENOBUFS; if (likely(sent > 0) || status == 0) { - req->rq_bytes_sent += sent; - req->rq_xmit_bytes_sent += sent; + transport->xmit.offset += sent; + req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_xmit_bytes_sent += transport->xmit.offset; req->rq_bytes_sent = 0; + transport->xmit.offset = 0; return 0; } status = -EAGAIN; @@ -550,7 +923,7 @@ static int xs_local_send_request(struct rpc_task *task) case -ENOBUFS: break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", @@ -566,7 +939,7 @@ static int xs_local_send_request(struct rpc_task *task) /** * xs_udp_send_request - write an RPC request to a UDP socket - * @task: address of RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -575,9 +948,8 @@ static int xs_local_send_request(struct rpc_task *task) * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occurred, the request was not sent */ -static int xs_udp_send_request(struct rpc_task *task) +static int xs_udp_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; @@ -590,12 +962,16 @@ static int xs_udp_send_request(struct rpc_task *task) if (!xprt_bound(xprt)) return -ENOTCONN; + + if (!xprt_request_get_cong(xprt, req)) + return -EBADSLT; + req->rq_xtime = ktime_get(); status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, - xdr, req->rq_bytes_sent, true, &sent); + xdr, 0, true, &sent); dprintk("RPC: xs_udp_send_request(%u) = %d\n", - xdr->len - req->rq_bytes_sent, status); + xdr->len, status); /* firewall is blocking us, don't return -EAGAIN or we end up looping */ if (status == -EPERM) @@ -619,7 +995,7 @@ process_status: /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; case -ENETUNREACH: case -ENOBUFS: @@ -639,7 +1015,7 @@ process_status: /** * xs_tcp_send_request - write an RPC request to a TCP socket - * @task: address of RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -651,9 +1027,8 @@ process_status: * XXX: In the case of soft timeouts, should we eventually give up * if sendmsg is not able to make progress? */ -static int xs_tcp_send_request(struct rpc_task *task) +static int xs_tcp_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; @@ -662,6 +1037,13 @@ static int xs_tcp_send_request(struct rpc_task *task) int status; int sent; + /* Close the stream if the previous transmission was incomplete */ + if (xs_send_request_was_aborted(transport, req)) { + if (transport->sock != NULL) + kernel_sock_shutdown(transport->sock, SHUT_RDWR); + return -ENOTCONN; + } + xs_encode_stream_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", @@ -671,7 +1053,7 @@ static int xs_tcp_send_request(struct rpc_task *task) * completes while the socket holds a reference to the pages, * then we may end up resending corrupted data. */ - if (task->tk_flags & RPC_TASK_SENT) + if (req->rq_task->tk_flags & RPC_TASK_SENT) zerocopy = false; if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state)) @@ -684,17 +1066,20 @@ static int xs_tcp_send_request(struct rpc_task *task) while (1) { sent = 0; status = xs_sendpages(transport->sock, NULL, 0, xdr, - req->rq_bytes_sent, zerocopy, &sent); + transport->xmit.offset, + zerocopy, &sent); dprintk("RPC: xs_tcp_send_request(%u) = %d\n", - xdr->len - req->rq_bytes_sent, status); + xdr->len - transport->xmit.offset, status); /* If we've sent the entire packet, immediately * reset the count of bytes sent. */ - req->rq_bytes_sent += sent; - req->rq_xmit_bytes_sent += sent; + transport->xmit.offset += sent; + req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_xmit_bytes_sent += transport->xmit.offset; req->rq_bytes_sent = 0; + transport->xmit.offset = 0; return 0; } @@ -732,7 +1117,7 @@ static int xs_tcp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; case -ECONNRESET: case -ECONNREFUSED: @@ -749,35 +1134,6 @@ static int xs_tcp_send_request(struct rpc_task *task) return status; } -/** - * xs_tcp_release_xprt - clean up after a tcp transmission - * @xprt: transport - * @task: rpc task - * - * This cleans up if an error causes us to abort the transmission of a request. - * In this case, the socket may need to be reset in order to avoid confusing - * the server. - */ -static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) -{ - struct rpc_rqst *req; - - if (task != xprt->snd_task) - return; - if (task == NULL) - goto out_release; - req = task->tk_rqstp; - if (req == NULL) - goto out_release; - if (req->rq_bytes_sent == 0) - goto out_release; - if (req->rq_bytes_sent == req->rq_snd_buf.len) - goto out_release; - set_bit(XPRT_CLOSE_WAIT, &xprt->state); -out_release: - xprt_release_xprt(xprt, task); -} - static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) { transport->old_data_ready = sk->sk_data_ready; @@ -921,114 +1277,6 @@ static void xs_destroy(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) -{ - struct xdr_skb_reader desc = { - .skb = skb, - .offset = sizeof(rpc_fraghdr), - .count = skb->len - sizeof(rpc_fraghdr), - }; - - if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) - return -1; - if (desc.count) - return -1; - return 0; -} - -/** - * xs_local_data_read_skb - * @xprt: transport - * @sk: socket - * @skb: skbuff - * - * Currently this assumes we can read the whole reply in a single gulp. - */ -static void xs_local_data_read_skb(struct rpc_xprt *xprt, - struct sock *sk, - struct sk_buff *skb) -{ - struct rpc_task *task; - struct rpc_rqst *rovr; - int repsize, copied; - u32 _xid; - __be32 *xp; - - repsize = skb->len - sizeof(rpc_fraghdr); - if (repsize < 4) { - dprintk("RPC: impossible RPC reply size %d\n", repsize); - return; - } - - /* Copy the XID from the skb... */ - xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid); - if (xp == NULL) - return; - - /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->recv_lock); - rovr = xprt_lookup_rqst(xprt, *xp); - if (!rovr) - goto out_unlock; - xprt_pin_rqst(rovr); - spin_unlock(&xprt->recv_lock); - task = rovr->rq_task; - - copied = rovr->rq_private_buf.buflen; - if (copied > repsize) - copied = repsize; - - if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { - dprintk("RPC: sk_buff copy failed\n"); - spin_lock(&xprt->recv_lock); - goto out_unpin; - } - - spin_lock(&xprt->recv_lock); - xprt_complete_rqst(task, copied); -out_unpin: - xprt_unpin_rqst(rovr); - out_unlock: - spin_unlock(&xprt->recv_lock); -} - -static void xs_local_data_receive(struct sock_xprt *transport) -{ - struct sk_buff *skb; - struct sock *sk; - int err; - -restart: - mutex_lock(&transport->recv_mutex); - sk = transport->inet; - if (sk == NULL) - goto out; - for (;;) { - skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb != NULL) { - xs_local_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); - continue; - } - if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - break; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } - } -out: - mutex_unlock(&transport->recv_mutex); -} - -static void xs_local_data_receive_workfn(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, recv_worker); - xs_local_data_receive(transport); -} - /** * xs_udp_data_read_skb - receive callback for UDP sockets * @xprt: transport @@ -1058,13 +1306,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; xprt_pin_rqst(rovr); xprt_update_rtt(rovr->rq_task); - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); task = rovr->rq_task; if ((copied = rovr->rq_private_buf.buflen) > repsize) @@ -1072,7 +1320,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, /* Suck it into the iovec, verify checksum if not done by hw. */ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); goto out_unpin; } @@ -1081,13 +1329,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, spin_lock_bh(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, copied); spin_unlock_bh(&xprt->transport_lock); - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); xprt_complete_rqst(task, copied); __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); out_unpin: xprt_unpin_rqst(rovr); out_unlock: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); } static void xs_udp_data_receive(struct sock_xprt *transport) @@ -1096,25 +1344,18 @@ static void xs_udp_data_receive(struct sock_xprt *transport) struct sock *sk; int err; -restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) goto out; + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); for (;;) { skb = skb_recv_udp(sk, 0, 1, &err); - if (skb != NULL) { - xs_udp_data_read_skb(&transport->xprt, sk, skb); - consume_skb(skb); - continue; - } - if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + if (skb == NULL) break; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } + xs_udp_data_read_skb(&transport->xprt, sk, skb); + consume_skb(skb); + cond_resched(); } out: mutex_unlock(&transport->recv_mutex); @@ -1163,263 +1404,7 @@ static void xs_tcp_force_close(struct rpc_xprt *xprt) xprt_force_disconnect(xprt); } -static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - size_t len, used; - char *p; - - p = ((char *) &transport->tcp_fraghdr) + transport->tcp_offset; - len = sizeof(transport->tcp_fraghdr) - transport->tcp_offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - - transport->tcp_reclen = ntohl(transport->tcp_fraghdr); - if (transport->tcp_reclen & RPC_LAST_STREAM_FRAGMENT) - transport->tcp_flags |= TCP_RCV_LAST_FRAG; - else - transport->tcp_flags &= ~TCP_RCV_LAST_FRAG; - transport->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK; - - transport->tcp_flags &= ~TCP_RCV_COPY_FRAGHDR; - transport->tcp_offset = 0; - - /* Sanity check of the record length */ - if (unlikely(transport->tcp_reclen < 8)) { - dprintk("RPC: invalid TCP record fragment length\n"); - xs_tcp_force_close(xprt); - return; - } - dprintk("RPC: reading TCP record fragment of length %d\n", - transport->tcp_reclen); -} - -static void xs_tcp_check_fraghdr(struct sock_xprt *transport) -{ - if (transport->tcp_offset == transport->tcp_reclen) { - transport->tcp_flags |= TCP_RCV_COPY_FRAGHDR; - transport->tcp_offset = 0; - if (transport->tcp_flags & TCP_RCV_LAST_FRAG) { - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - transport->tcp_flags |= TCP_RCV_COPY_XID; - transport->tcp_copied = 0; - } - } -} - -static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_reader *desc) -{ - size_t len, used; - char *p; - - len = sizeof(transport->tcp_xid) - transport->tcp_offset; - dprintk("RPC: reading XID (%zu bytes)\n", len); - p = ((char *) &transport->tcp_xid) + transport->tcp_offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - transport->tcp_flags &= ~TCP_RCV_COPY_XID; - transport->tcp_flags |= TCP_RCV_READ_CALLDIR; - transport->tcp_copied = 4; - dprintk("RPC: reading %s XID %08x\n", - (transport->tcp_flags & TCP_RPC_REPLY) ? "reply for" - : "request with", - ntohl(transport->tcp_xid)); - xs_tcp_check_fraghdr(transport); -} - -static inline void xs_tcp_read_calldir(struct sock_xprt *transport, - struct xdr_skb_reader *desc) -{ - size_t len, used; - u32 offset; - char *p; - - /* - * We want transport->tcp_offset to be 8 at the end of this routine - * (4 bytes for the xid and 4 bytes for the call/reply flag). - * When this function is called for the first time, - * transport->tcp_offset is 4 (after having already read the xid). - */ - offset = transport->tcp_offset - sizeof(transport->tcp_xid); - len = sizeof(transport->tcp_calldir) - offset; - dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len); - p = ((char *) &transport->tcp_calldir) + offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - transport->tcp_flags &= ~TCP_RCV_READ_CALLDIR; - /* - * We don't yet have the XDR buffer, so we will write the calldir - * out after we get the buffer from the 'struct rpc_rqst' - */ - switch (ntohl(transport->tcp_calldir)) { - case RPC_REPLY: - transport->tcp_flags |= TCP_RCV_COPY_CALLDIR; - transport->tcp_flags |= TCP_RCV_COPY_DATA; - transport->tcp_flags |= TCP_RPC_REPLY; - break; - case RPC_CALL: - transport->tcp_flags |= TCP_RCV_COPY_CALLDIR; - transport->tcp_flags |= TCP_RCV_COPY_DATA; - transport->tcp_flags &= ~TCP_RPC_REPLY; - break; - default: - dprintk("RPC: invalid request message type\n"); - xs_tcp_force_close(&transport->xprt); - } - xs_tcp_check_fraghdr(transport); -} - -static inline void xs_tcp_read_common(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc, - struct rpc_rqst *req) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct xdr_buf *rcvbuf; - size_t len; - ssize_t r; - - rcvbuf = &req->rq_private_buf; - - if (transport->tcp_flags & TCP_RCV_COPY_CALLDIR) { - /* - * Save the RPC direction in the XDR buffer - */ - memcpy(rcvbuf->head[0].iov_base + transport->tcp_copied, - &transport->tcp_calldir, - sizeof(transport->tcp_calldir)); - transport->tcp_copied += sizeof(transport->tcp_calldir); - transport->tcp_flags &= ~TCP_RCV_COPY_CALLDIR; - } - - len = desc->count; - if (len > transport->tcp_reclen - transport->tcp_offset) - desc->count = transport->tcp_reclen - transport->tcp_offset; - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, - desc, xdr_skb_read_bits); - - if (desc->count) { - /* Error when copying to the receive buffer, - * usually because we weren't able to allocate - * additional buffer pages. All we can do now - * is turn off TCP_RCV_COPY_DATA, so the request - * will not receive any additional updates, - * and time out. - * Any remaining data from this record will - * be discarded. - */ - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - dprintk("RPC: XID %08x truncated request\n", - ntohl(transport->tcp_xid)); - dprintk("RPC: xprt = %p, tcp_copied = %lu, " - "tcp_offset = %u, tcp_reclen = %u\n", - xprt, transport->tcp_copied, - transport->tcp_offset, transport->tcp_reclen); - return; - } - - transport->tcp_copied += r; - transport->tcp_offset += r; - desc->count = len - r; - - dprintk("RPC: XID %08x read %zd bytes\n", - ntohl(transport->tcp_xid), r); - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " - "tcp_reclen = %u\n", xprt, transport->tcp_copied, - transport->tcp_offset, transport->tcp_reclen); - - if (transport->tcp_copied == req->rq_private_buf.buflen) - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - else if (transport->tcp_offset == transport->tcp_reclen) { - if (transport->tcp_flags & TCP_RCV_LAST_FRAG) - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - } -} - -/* - * Finds the request corresponding to the RPC xid and invokes the common - * tcp read code to read the data. - */ -static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct rpc_rqst *req; - - dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); - - /* Find and lock the request corresponding to this xid */ - spin_lock(&xprt->recv_lock); - req = xprt_lookup_rqst(xprt, transport->tcp_xid); - if (!req) { - dprintk("RPC: XID %08x request not found!\n", - ntohl(transport->tcp_xid)); - spin_unlock(&xprt->recv_lock); - return -1; - } - xprt_pin_rqst(req); - spin_unlock(&xprt->recv_lock); - - xs_tcp_read_common(xprt, desc, req); - - spin_lock(&xprt->recv_lock); - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) - xprt_complete_rqst(req->rq_task, transport->tcp_copied); - xprt_unpin_rqst(req); - spin_unlock(&xprt->recv_lock); - return 0; -} - #if defined(CONFIG_SUNRPC_BACKCHANNEL) -/* - * Obtains an rpc_rqst previously allocated and invokes the common - * tcp read code to read the data. The result is placed in the callback - * queue. - * If we're unable to obtain the rpc_rqst we schedule the closing of the - * connection and return -1. - */ -static int xs_tcp_read_callback(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct rpc_rqst *req; - - /* Look up the request corresponding to the given XID */ - req = xprt_lookup_bc_request(xprt, transport->tcp_xid); - if (req == NULL) { - printk(KERN_WARNING "Callback slot table overflowed\n"); - xprt_force_disconnect(xprt); - return -1; - } - - dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid)); - xs_tcp_read_common(xprt, desc, req); - - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) - xprt_complete_bc_request(req, transport->tcp_copied); - - return 0; -} - -static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - - return (transport->tcp_flags & TCP_RPC_REPLY) ? - xs_tcp_read_reply(xprt, desc) : - xs_tcp_read_callback(xprt, desc); -} - static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) { int ret; @@ -1435,145 +1420,8 @@ static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) { return PAGE_SIZE; } -#else -static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - return xs_tcp_read_reply(xprt, desc); -} #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* - * Read data off the transport. This can be either an RPC_CALL or an - * RPC_REPLY. Relay the processing to helper functions. - */ -static void xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - - if (_xs_tcp_read_data(xprt, desc) == 0) - xs_tcp_check_fraghdr(transport); - else { - /* - * The transport_lock protects the request handling. - * There's no need to hold it to update the tcp_flags. - */ - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - } -} - -static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_skb_reader *desc) -{ - size_t len; - - len = transport->tcp_reclen - transport->tcp_offset; - if (len > desc->count) - len = desc->count; - desc->count -= len; - desc->offset += len; - transport->tcp_offset += len; - dprintk("RPC: discarded %zu bytes\n", len); - xs_tcp_check_fraghdr(transport); -} - -static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) -{ - struct rpc_xprt *xprt = rd_desc->arg.data; - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - struct xdr_skb_reader desc = { - .skb = skb, - .offset = offset, - .count = len, - }; - size_t ret; - - dprintk("RPC: xs_tcp_data_recv started\n"); - do { - trace_xs_tcp_data_recv(transport); - /* Read in a new fragment marker if necessary */ - /* Can we ever really expect to get completely empty fragments? */ - if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) { - xs_tcp_read_fraghdr(xprt, &desc); - continue; - } - /* Read in the xid if necessary */ - if (transport->tcp_flags & TCP_RCV_COPY_XID) { - xs_tcp_read_xid(transport, &desc); - continue; - } - /* Read in the call/reply flag */ - if (transport->tcp_flags & TCP_RCV_READ_CALLDIR) { - xs_tcp_read_calldir(transport, &desc); - continue; - } - /* Read in the request data */ - if (transport->tcp_flags & TCP_RCV_COPY_DATA) { - xs_tcp_read_data(xprt, &desc); - continue; - } - /* Skip over any trailing bytes on short reads */ - xs_tcp_read_discard(transport, &desc); - } while (desc.count); - ret = len - desc.count; - if (ret < rd_desc->count) - rd_desc->count -= ret; - else - rd_desc->count = 0; - trace_xs_tcp_data_recv(transport); - dprintk("RPC: xs_tcp_data_recv done\n"); - return ret; -} - -static void xs_tcp_data_receive(struct sock_xprt *transport) -{ - struct rpc_xprt *xprt = &transport->xprt; - struct sock *sk; - read_descriptor_t rd_desc = { - .arg.data = xprt, - }; - unsigned long total = 0; - int read = 0; - -restart: - mutex_lock(&transport->recv_mutex); - sk = transport->inet; - if (sk == NULL) - goto out; - - /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - for (;;) { - rd_desc.count = RPC_TCP_READ_CHUNK_SZ; - lock_sock(sk); - read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - if (rd_desc.count != 0 || read < 0) { - clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); - release_sock(sk); - break; - } - release_sock(sk); - total += read; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } - } - if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - queue_work(xprtiod_workqueue, &transport->recv_worker); -out: - mutex_unlock(&transport->recv_mutex); - trace_xs_tcp_data_ready(xprt, read, total); -} - -static void xs_tcp_data_receive_workfn(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, recv_worker); - xs_tcp_data_receive(transport); -} - /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1600,17 +1448,13 @@ static void xs_tcp_state_change(struct sock *sk) case TCP_ESTABLISHED: spin_lock(&xprt->transport_lock); if (!xprt_test_and_set_connected(xprt)) { - - /* Reset TCP record info */ - transport->tcp_offset = 0; - transport->tcp_reclen = 0; - transport->tcp_copied = 0; - transport->tcp_flags = - TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; xprt->connect_cookie++; clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); xprt_clear_connecting(xprt); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_wake_pending_tasks(xprt, -EAGAIN); } spin_unlock(&xprt->transport_lock); @@ -1675,7 +1519,8 @@ static void xs_write_space(struct sock *sk) if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) goto out; - xprt_write_space(xprt); + if (xprt_write_space(xprt)) + sk->sk_write_pending--; out: rcu_read_unlock(); } @@ -1773,11 +1618,17 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); } -static unsigned short xs_get_random_port(void) +static int xs_get_random_port(void) { - unsigned short range = xprt_max_resvport - xprt_min_resvport + 1; - unsigned short rand = (unsigned short) prandom_u32() % range; - return rand + xprt_min_resvport; + unsigned short min = xprt_min_resvport, max = xprt_max_resvport; + unsigned short range; + unsigned short rand; + + if (max < min) + return -EADDRINUSE; + range = max - min + 1; + rand = (unsigned short) prandom_u32() % range; + return rand + min; } /** @@ -1833,9 +1684,9 @@ static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock) transport->srcport = xs_sock_getport(sock); } -static unsigned short xs_get_srcport(struct sock_xprt *transport) +static int xs_get_srcport(struct sock_xprt *transport) { - unsigned short port = transport->srcport; + int port = transport->srcport; if (port == 0 && transport->xprt.resvport) port = xs_get_random_port(); @@ -1856,7 +1707,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) { struct sockaddr_storage myaddr; int err, nloop = 0; - unsigned short port = xs_get_srcport(transport); + int port = xs_get_srcport(transport); unsigned short last; /* @@ -1874,8 +1725,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) * transport->xprt.resvport == 1) xs_get_srcport above will * ensure that port is non-zero and we will bind as needed. */ - if (port == 0) - return 0; + if (port <= 0) + return port; memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { @@ -2028,9 +1879,8 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, write_unlock_bh(&sk->sk_callback_lock); } - /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; + xs_stream_reset_connect(transport); + return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); } @@ -2062,6 +1912,9 @@ static int xs_local_setup_socket(struct sock_xprt *transport) case 0: dprintk("RPC: xprt %p connected to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_set_connected(xprt); case -ENOBUFS: break; @@ -2386,9 +2239,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_set_memalloc(xprt); + /* Reset TCP record info */ + xs_stream_reset_connect(transport); + /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); switch (ret) { @@ -2561,7 +2415,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) "%llu %llu %lu %llu %llu\n", xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -2616,7 +2470,7 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) transport->srcport, xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -2704,9 +2558,8 @@ static int bc_sendto(struct rpc_rqst *req) /* * The send routine. Borrows from svc_send */ -static int bc_send_request(struct rpc_task *task) +static int bc_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct svc_xprt *xprt; int len; @@ -2720,12 +2573,7 @@ static int bc_send_request(struct rpc_task *task) * Grab the mutex to serialize data as the connection is shared * with the fore channel */ - if (!mutex_trylock(&xprt->xpt_mutex)) { - rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL); - if (!mutex_trylock(&xprt->xpt_mutex)) - return -EAGAIN; - rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task); - } + mutex_lock(&xprt->xpt_mutex); if (test_bit(XPT_DEAD, &xprt->xpt_flags)) len = -ENOTCONN; else @@ -2761,7 +2609,7 @@ static void bc_destroy(struct rpc_xprt *xprt) static const struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, - .release_xprt = xs_tcp_release_xprt, + .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = xs_local_rpcbind, @@ -2769,6 +2617,7 @@ static const struct rpc_xprt_ops xs_local_ops = { .connect = xs_local_connect, .buf_alloc = rpc_malloc, .buf_free = rpc_free, + .prepare_request = xs_stream_prepare_request, .send_request = xs_local_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, @@ -2803,14 +2652,15 @@ static const struct rpc_xprt_ops xs_udp_ops = { static const struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, - .release_xprt = xs_tcp_release_xprt, - .alloc_slot = xprt_lock_and_alloc_slot, + .release_xprt = xprt_release_xprt, + .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, .buf_alloc = rpc_malloc, .buf_free = rpc_free, + .prepare_request = xs_stream_prepare_request, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_tcp_shutdown, @@ -2952,9 +2802,8 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) xprt->ops = &xs_local_ops; xprt->timeout = &xs_local_default_timeout; - INIT_WORK(&transport->recv_worker, xs_local_data_receive_workfn); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_dummy_setup_socket); + INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); + INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket); switch (sun->sun_family) { case AF_LOCAL: @@ -3106,7 +2955,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->connect_timeout = xprt->timeout->to_initval * (xprt->timeout->to_retries + 1); - INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn); + INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); switch (addr->sa_family) { @@ -3317,12 +3166,8 @@ static int param_set_uint_minmax(const char *val, static int param_set_portnr(const char *val, const struct kernel_param *kp) { - if (kp->arg == &xprt_min_resvport) - return param_set_uint_minmax(val, kp, - RPC_MIN_RESVPORT, - xprt_max_resvport); return param_set_uint_minmax(val, kp, - xprt_min_resvport, + RPC_MIN_RESVPORT, RPC_MAX_RESVPORT); }