Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Jun 2014 17:30:18 +0000 (10:30 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Jun 2014 17:30:18 +0000 (10:30 -0700)
Pull vfs updates from Al Viro:
 "This the bunch that sat in -next + lock_parent() fix.  This is the
  minimal set; there's more pending stuff.

  In particular, I really hope to get acct.c fixes merged this cycle -
  we need that to deal sanely with delayed-mntput stuff.  In the next
  pile, hopefully - that series is fairly short and localized
  (kernel/acct.c, fs/super.c and fs/namespace.c).  In this pile: more
  iov_iter work.  Most of the prereqs for ->splice_write with sane locking
  order are there and Kent's dio rewrite would also fit nicely on top of
  this pile"

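The recurring conversion in this pile moves filesystems from the
iovec-array methods (->aio_read/->aio_write plus hand-rolled
->read/->write wrappers) to the iov_iter-based ->read_iter/->write_iter,
with generic helpers supplying the synchronous and splice paths. A
minimal sketch of the resulting wiring for a hypothetical filesystem
"foo" (the generic helpers named here are the ones this series
introduces and uses in the lustre conversion below):

	/* sketch only: "foo" is hypothetical */
	static ssize_t foo_read_iter(struct kiocb *iocb, struct iov_iter *to)
	{
		return generic_file_read_iter(iocb, to);
	}

	static ssize_t foo_write_iter(struct kiocb *iocb, struct iov_iter *from)
	{
		return generic_file_write_iter(iocb, from);
	}

	const struct file_operations foo_file_operations = {
		.read		= new_sync_read,	/* read(2) bridges to ->read_iter */
		.write		= new_sync_write,	/* write(2) bridges to ->write_iter */
		.read_iter	= foo_read_iter,
		.write_iter	= foo_write_iter,
		.splice_read	= generic_file_splice_read,
		.splice_write	= iter_file_splice_write, /* replaces generic_file_splice_write() */
	};
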
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (70 commits)
  lock_parent: don't step on stale ->d_parent of all-but-freed one
  kill generic_file_splice_write()
  ceph: switch to iter_file_splice_write()
  shmem: switch to iter_file_splice_write()
  nfs: switch to iter_file_splice_write()
  fs/splice.c: remove unneeded exports
  ocfs2: switch to iter_file_splice_write()
  ->splice_write() via ->write_iter()
  bio_vec-backed iov_iter
  optimize copy_page_{to,from}_iter()
  bury generic_file_aio_{read,write}
  lustre: get rid of messing with iovecs
  ceph: switch to ->write_iter()
  ceph_sync_direct_write: stop poking into iov_iter guts
  ceph_sync_read: stop poking into iov_iter guts
  new helper: copy_page_from_iter()
  fuse: switch to ->write_iter()
  btrfs: switch to ->write_iter()
  ocfs2: switch to ->write_iter()
  xfs: switch to ->write_iter()
  ...

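Among the new primitives listed above, copy_page_from_iter() is the
page-cache-side workhorse for ->write_iter() conversions. A minimal
sketch of its use (hypothetical helper; locking and
write_begin/write_end bookkeeping omitted):

	/* copy at most one page worth of data from the iterator;
	 * copy_page_from_iter() advances the iter by the bytes copied */
	static size_t copy_one_page(struct page *page, loff_t pos,
				    struct iov_iter *i)
	{
		size_t offset = offset_in_page(pos);
		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
				     iov_iter_count(i));

		return copy_page_from_iter(page, offset, bytes, i);
	}
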
47 files changed:
drivers/staging/lustre/lustre/lclient/lcommon_cl.c
drivers/staging/lustre/lustre/llite/file.c
drivers/staging/lustre/lustre/llite/llite_internal.h
drivers/staging/lustre/lustre/llite/rw.c
drivers/staging/lustre/lustre/llite/rw26.c
drivers/staging/lustre/lustre/llite/vvp_io.c
drivers/usb/gadget/storage_common.c
fs/9p/vfs_file.c
fs/affs/file.c
fs/block_dev.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/ceph/addr.c
fs/cifs/cifsfs.c
fs/cifs/cifsfs.h
fs/cifs/file.c
fs/dcache.c
fs/ext3/inode.c
fs/ext4/ext4.h
fs/ext4/file.c
fs/ext4/inode.c
fs/f2fs/data.c
fs/f2fs/file.c
fs/fat/inode.c
fs/file_table.c
fs/fuse/file.c
fs/gfs2/aops.c
fs/gfs2/file.c
fs/nfs/direct.c
fs/nfs/file.c
fs/nfs/internal.h
fs/nfs/nfs4file.c
fs/ntfs/file.c
fs/ocfs2/file.c
fs/reiserfs/file.c
fs/reiserfs/inode.c
fs/ubifs/file.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_file.c
fs/xfs/xfs_trace.h
include/linux/blk_types.h
include/linux/fs.h
include/linux/nfs_fs.h
mm/filemap.c
mm/page_io.c
mm/shmem.c
mm/vmscan.c

diff --cc drivers/staging/lustre/lustre/lclient/lcommon_cl.c
index dc24cfa5803722dd86669102189c27d793b83ecf,a07d5156bc50c017f8d942b090c7ce901735bb94..1b0c216bc5687742198c89c8d6f21b07036a11bb
@@@ -63,7 -63,7 +63,7 @@@
  
  #include "../llite/llite_internal.h"
  
 -const struct cl_req_operations ccc_req_ops;
 +static const struct cl_req_operations ccc_req_ops;
  
  /*
   * ccc_ prefix stands for "Common Client Code".
@@@ -112,11 -112,12 +112,11 @@@ static struct lu_kmem_descr ccc_caches[
   *
   */
  
 -void *ccc_key_init(const struct lu_context *ctx,
 -                        struct lu_context_key *key)
 +void *ccc_key_init(const struct lu_context *ctx, struct lu_context_key *key)
  {
        struct ccc_thread_info *info;
  
 -      OBD_SLAB_ALLOC_PTR_GFP(info, ccc_thread_kmem, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(info, ccc_thread_kmem, GFP_NOFS);
        if (info == NULL)
                info = ERR_PTR(-ENOMEM);
        return info;
@@@ -134,7 -135,7 +134,7 @@@ void *ccc_session_key_init(const struc
  {
        struct ccc_session *session;
  
 -      OBD_SLAB_ALLOC_PTR_GFP(session, ccc_session_kmem, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(session, ccc_session_kmem, GFP_NOFS);
        if (session == NULL)
                session = ERR_PTR(-ENOMEM);
        return session;
@@@ -250,7 -251,7 +250,7 @@@ int ccc_req_init(const struct lu_env *e
        struct ccc_req *vrq;
        int result;
  
 -      OBD_SLAB_ALLOC_PTR_GFP(vrq, ccc_req_kmem, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(vrq, ccc_req_kmem, GFP_NOFS);
        if (vrq != NULL) {
                cl_req_slice_add(req, &vrq->crq_cl, dev, &ccc_req_ops);
                result = 0;
@@@ -326,7 -327,7 +326,7 @@@ struct lu_object *ccc_object_alloc(cons
        struct ccc_object *vob;
        struct lu_object  *obj;
  
 -      OBD_SLAB_ALLOC_PTR_GFP(vob, ccc_object_kmem, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(vob, ccc_object_kmem, GFP_NOFS);
        if (vob != NULL) {
                struct cl_object_header *hdr;
  
@@@ -395,7 -396,7 +395,7 @@@ int ccc_lock_init(const struct lu_env *
  
        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
  
 -      OBD_SLAB_ALLOC_PTR_GFP(clk, ccc_lock_kmem, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(clk, ccc_lock_kmem, GFP_NOFS);
        if (clk != NULL) {
                cl_lock_slice_add(lock, &clk->clk_cl, obj, lkops);
                result = 0;
@@@ -720,31 -721,12 +720,12 @@@ int ccc_io_one_lock_index(const struct 
  void ccc_io_update_iov(const struct lu_env *env,
                       struct ccc_io *cio, struct cl_io *io)
  {
-       int i;
        size_t size = io->u.ci_rw.crw_count;
  
-       cio->cui_iov_olen = 0;
-       if (!cl_is_normalio(env, io) || cio->cui_tot_nrsegs == 0)
+       if (!cl_is_normalio(env, io) || cio->cui_iter == NULL)
                return;
  
-       for (i = 0; i < cio->cui_tot_nrsegs; i++) {
-               struct iovec *iv = &cio->cui_iov[i];
-               if (iv->iov_len < size)
-                       size -= iv->iov_len;
-               else {
-                       if (iv->iov_len > size) {
-                               cio->cui_iov_olen = iv->iov_len;
-                               iv->iov_len = size;
-                       }
-                       break;
-               }
-       }
-       cio->cui_nrsegs = i + 1;
-       LASSERTF(cio->cui_tot_nrsegs >= cio->cui_nrsegs,
-                "tot_nrsegs: %lu, nrsegs: %lu\n",
-                cio->cui_tot_nrsegs, cio->cui_nrsegs);
+       iov_iter_truncate(cio->cui_iter, size);
  }
  
  int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
@@@ -775,30 -757,7 +756,7 @@@ void ccc_io_advance(const struct lu_en
        if (!cl_is_normalio(env, io))
                return;
  
-       LASSERT(cio->cui_tot_nrsegs >= cio->cui_nrsegs);
-       LASSERT(cio->cui_tot_count  >= nob);
-       cio->cui_iov    += cio->cui_nrsegs;
-       cio->cui_tot_nrsegs -= cio->cui_nrsegs;
-       cio->cui_tot_count  -= nob;
-       /* update the iov */
-       if (cio->cui_iov_olen > 0) {
-               struct iovec *iv;
-               cio->cui_iov--;
-               cio->cui_tot_nrsegs++;
-               iv = &cio->cui_iov[0];
-               if (io->ci_continue) {
-                       iv->iov_base += iv->iov_len;
-                       LASSERT(cio->cui_iov_olen > iv->iov_len);
-                       iv->iov_len = cio->cui_iov_olen - iv->iov_len;
-               } else {
-                       /* restore the iov_len, in case of restart io. */
-                       iv->iov_len = cio->cui_iov_olen;
-               }
-               cio->cui_iov_olen = 0;
-       }
+       iov_iter_reexpand(cio->cui_iter, cio->cui_tot_count  -= nob);
  }
  
  /**
@@@ -962,7 -921,7 +920,7 @@@ void ccc_req_attr_set(const struct lu_e
               JOBSTATS_JOBID_SIZE);
  }
  
 -const struct cl_req_operations ccc_req_ops = {
 +static const struct cl_req_operations ccc_req_ops = {
        .cro_attr_set   = ccc_req_attr_set,
        .cro_completion = ccc_req_completion
  };
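
The ccc_io_update_iov() and ccc_io_advance() hunks above are the
canonical shape of this conversion: the saved segment length
(cui_iov_olen), the clipping loop and the restart-time restore all
collapse into iov_iter primitives, because the iterator itself now
remembers how much it may hand out. A minimal sketch of that
truncate/advance/reexpand lifecycle (process_chunk() is hypothetical
and is assumed to consume the iterator via iov_iter_advance()):

	/* sketch: run one bounded round of I/O over an iov_iter */
	static ssize_t bounded_round(struct iov_iter *iter, size_t quota)
	{
		size_t total = iov_iter_count(iter);
		ssize_t done;

		if (quota > total)
			quota = total;
		iov_iter_truncate(iter, quota);	/* hide everything past the quota */
		done = process_chunk(iter);	/* hypothetical; advances the iter */
		/* re-expose the hidden tail: unconsumed quota + hidden bytes */
		iov_iter_reexpand(iter, iov_iter_count(iter) + (total - quota));
		return done;
	}
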
diff --cc drivers/staging/lustre/lustre/llite/file.c
index c4ddec2b3589eb743475f022c29d47ce49dbbec8,3efda2540d295fb41595847419165811ee431652..716e1ee0104f6fe0c2c1323689f216acff807bdc
  
  #include "cl_object.h"
  
 -struct ll_file_data *ll_file_data_get(void)
 +static int
 +ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
 +
 +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 +                        bool *lease_broken);
 +
 +static enum llioc_iter
 +ll_iocontrol_call(struct inode *inode, struct file *file,
 +                unsigned int cmd, unsigned long arg, int *rcp);
 +
 +static struct ll_file_data *ll_file_data_get(void)
  {
        struct ll_file_data *fd;
  
 -      OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
        if (fd == NULL)
                return NULL;
        fd->fd_write_failed = false;
@@@ -257,8 -247,8 +257,8 @@@ int ll_md_real_close(struct inode *inod
        return rc;
  }
  
 -int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 -              struct file *file)
 +static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 +                     struct file *file)
  {
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_inode_info *lli = ll_i2info(inode);
@@@ -492,8 -482,8 +492,8 @@@ static int ll_och_fill(struct obd_expor
        return md_set_open_replay_data(md_exp, och, it);
  }
  
 -int ll_local_open(struct file *file, struct lookup_intent *it,
 -                struct ll_file_data *fd, struct obd_client_handle *och)
 +static int ll_local_open(struct file *file, struct lookup_intent *it,
 +                       struct ll_file_data *fd, struct obd_client_handle *och)
  {
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
@@@ -743,9 -733,8 +743,9 @@@ static int ll_md_blocking_lease_ast(str
  /**
   * Acquire a lease and open the file.
   */
 -struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
 -                                      fmode_t fmode, __u64 open_flags)
 +static struct obd_client_handle *
 +ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 +            __u64 open_flags)
  {
        struct lookup_intent it = { .it_op = IT_OPEN };
        struct ll_sb_info *sbi = ll_i2sbi(inode);
@@@ -873,13 -862,14 +873,13 @@@ out
        OBD_FREE_PTR(och);
        return ERR_PTR(rc);
  }
 -EXPORT_SYMBOL(ll_lease_open);
  
  /**
   * Release lease and close the file.
   * It will check if the lease has ever broken.
   */
 -int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 -                      bool *lease_broken)
 +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 +                        bool *lease_broken)
  {
        struct ldlm_lock *lock;
        bool cancelled = true;
                                       NULL);
        return rc;
  }
 -EXPORT_SYMBOL(ll_lease_close);
  
  /* Fills the obdo with the attributes for the lsm */
  static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
@@@ -1114,9 -1105,7 +1114,7 @@@ restart
  
                switch (vio->cui_io_subtype) {
                case IO_NORMAL:
-                       cio->cui_iov = args->u.normal.via_iov;
-                       cio->cui_nrsegs = args->u.normal.via_nrsegs;
-                       cio->cui_tot_nrsegs = cio->cui_nrsegs;
+                       cio->cui_iter = args->u.normal.via_iter;
                        cio->cui_iocb = args->u.normal.via_iocb;
                        if ((iot == CIT_WRITE) &&
                            !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
        return result;
  }
  
- static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos)
+ static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
        struct lu_env      *env;
        struct vvp_io_args *args;
-       size_t        count = 0;
        ssize_t      result;
        int              refcheck;
  
-       result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
-       if (result)
-               return result;
        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                return PTR_ERR(env);
  
        args = vvp_env_args(env, IO_NORMAL);
-       args->u.normal.via_iov = (struct iovec *)iov;
-       args->u.normal.via_nrsegs = nr_segs;
+       args->u.normal.via_iter = to;
        args->u.normal.via_iocb = iocb;
  
        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
-                                   &iocb->ki_pos, count);
-       cl_env_put(env, &refcheck);
-       return result;
- }
- static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
-                           loff_t *ppos)
- {
-       struct lu_env *env;
-       struct iovec  *local_iov;
-       struct kiocb  *kiocb;
-       ssize_t result;
-       int         refcheck;
-       env = cl_env_get(&refcheck);
-       if (IS_ERR(env))
-               return PTR_ERR(env);
-       local_iov = &vvp_env_info(env)->vti_local_iov;
-       kiocb = &vvp_env_info(env)->vti_kiocb;
-       local_iov->iov_base = (void __user *)buf;
-       local_iov->iov_len = count;
-       init_sync_kiocb(kiocb, file);
-       kiocb->ki_pos = *ppos;
-       kiocb->ki_nbytes = count;
-       result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
-       *ppos = kiocb->ki_pos;
+                                   &iocb->ki_pos, iov_iter_count(to));
        cl_env_put(env, &refcheck);
        return result;
  }
  /*
   * Write to a file (through the page cache).
   */
- static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                unsigned long nr_segs, loff_t pos)
+ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct lu_env      *env;
        struct vvp_io_args *args;
-       size_t        count = 0;
        ssize_t      result;
        int              refcheck;
  
-       result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
-       if (result)
-               return result;
        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                return PTR_ERR(env);
  
        args = vvp_env_args(env, IO_NORMAL);
-       args->u.normal.via_iov = (struct iovec *)iov;
-       args->u.normal.via_nrsegs = nr_segs;
+       args->u.normal.via_iter = from;
        args->u.normal.via_iocb = iocb;
  
        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
-                                 &iocb->ki_pos, count);
+                                 &iocb->ki_pos, iov_iter_count(from));
        cl_env_put(env, &refcheck);
        return result;
  }
  
- static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
-                            loff_t *ppos)
- {
-       struct lu_env *env;
-       struct iovec  *local_iov;
-       struct kiocb  *kiocb;
-       ssize_t result;
-       int         refcheck;
-       env = cl_env_get(&refcheck);
-       if (IS_ERR(env))
-               return PTR_ERR(env);
-       local_iov = &vvp_env_info(env)->vti_local_iov;
-       kiocb = &vvp_env_info(env)->vti_kiocb;
-       local_iov->iov_base = (void __user *)buf;
-       local_iov->iov_len = count;
-       init_sync_kiocb(kiocb, file);
-       kiocb->ki_pos = *ppos;
-       kiocb->ki_nbytes = count;
-       result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
-       *ppos = kiocb->ki_pos;
-       cl_env_put(env, &refcheck);
-       return result;
- }
  /*
   * Send file content (through pagecache) somewhere with helper
   */
@@@ -1449,7 -1366,7 +1375,7 @@@ int ll_lov_getstripe_ea_info(struct ino
        struct md_op_data *op_data;
        int rc, lmmsize;
  
 -      rc = ll_get_max_mdsize(sbi, &lmmsize);
 +      rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc)
                return rc;
  
@@@ -1599,8 -1516,7 +1525,8 @@@ static int ll_lov_getstripe(struct inod
        return rc;
  }
  
 -int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
 +static int
 +ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
  {
        struct ll_inode_info   *lli = ll_i2info(inode);
        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
   * Get size for inode for which FIEMAP mapping is requested.
   * Make the FIEMAP get_info call and returns the result.
   */
 -int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
 -            int num_bytes)
 +static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
 +                      size_t num_bytes)
  {
        struct obd_export *exp = ll_i2dtexp(inode);
        struct lov_stripe_md *lsm = NULL;
        struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
 -      int vallen = num_bytes;
 +      __u32 vallen = num_bytes;
        int rc;
  
        /* Checks for fiemap flags */
@@@ -1829,10 -1745,6 +1755,10 @@@ static int ll_ioctl_fiemap(struct inod
        if (get_user(extent_count,
            &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
                return -EFAULT;
 +
 +      if (extent_count >=
 +          (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
 +              return -EINVAL;
        num_bytes = sizeof(*fiemap_s) + (extent_count *
                                         sizeof(struct ll_fiemap_extent));
  
@@@ -2204,8 -2116,7 +2130,8 @@@ out
        return rc;
  }
  
 -long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 +static long
 +ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
  {
        struct inode            *inode = file->f_dentry->d_inode;
        struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
  }
  
  
 -loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
 +static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
  {
        struct inode *inode = file->f_dentry->d_inode;
        loff_t retval, eof = 0;
        return retval;
  }
  
 -int ll_flush(struct file *file, fl_owner_t id)
 +static int ll_flush(struct file *file, fl_owner_t id)
  {
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
  
  /**
   * Called to make sure a portion of file has been written out.
 - * if @local_only is not true, it will send OST_SYNC RPCs to ost.
 + * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
   *
   * Return how many pages have been written.
   */
@@@ -2668,10 -2579,11 +2594,10 @@@ int ll_fsync(struct file *file, loff_t 
        if (!err)
                ptlrpc_req_finished(req);
  
 -      if (datasync && S_ISREG(inode->i_mode)) {
 +      if (S_ISREG(inode->i_mode)) {
                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
  
 -              err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
 -                              CL_FSYNC_ALL, 0);
 +              err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
                if (rc == 0 && err < 0)
                        rc = err;
                if (rc < 0)
        return rc;
  }
  
 -int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
 +static int
 +ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
  {
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
  
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
  
 -      if (file_lock->fl_flags & FL_FLOCK) {
 +      if (file_lock->fl_flags & FL_FLOCK)
                LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
 -              /* flocks are whole-file locks */
 -              flock.l_flock.end = OFFSET_MAX;
 -              /* For flocks owner is determined by the local file descriptor*/
 -              flock.l_flock.owner = (unsigned long)file_lock->fl_file;
 -      } else if (file_lock->fl_flags & FL_POSIX) {
 -              flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
 -              flock.l_flock.start = file_lock->fl_start;
 -              flock.l_flock.end = file_lock->fl_end;
 -      } else {
 +      else if (!(file_lock->fl_flags & FL_POSIX))
                return -EINVAL;
 -      }
 +
 +      flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
        flock.l_flock.pid = file_lock->fl_pid;
 +      flock.l_flock.start = file_lock->fl_start;
 +      flock.l_flock.end = file_lock->fl_end;
  
        /* Somewhat ugly workaround for svc lockd.
         * lockd installs custom fl_lmops->lm_compare_owner that checks
        return rc;
  }
  
 -int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
 +static int
 +ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
  {
        return -ENOSYS;
  }
@@@ -2893,16 -2808,16 +2819,16 @@@ static int ll_inode_revalidate_fini(str
                if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
                        return 0;
        } else if (rc != 0) {
 -              CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
 -                     ll_get_fsname(inode->i_sb, NULL, 0),
 -                     PFID(ll_inode2fid(inode)), rc);
 +              CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
 +                           "%s: revalidate FID "DFID" error: rc = %d\n",
 +                           ll_get_fsname(inode->i_sb, NULL, 0),
 +                           PFID(ll_inode2fid(inode)), rc);
        }
  
        return rc;
  }
  
 -int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
 -                           __u64 ibits)
 +static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
  {
        struct inode *inode = dentry->d_inode;
        struct ptlrpc_request *req = NULL;
                int ealen = 0;
  
                if (S_ISREG(inode->i_mode)) {
 -                      rc = ll_get_max_mdsize(sbi, &ealen);
 +                      rc = ll_get_default_mdsize(sbi, &ealen);
                        if (rc)
                                return rc;
                        valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
        return rc;
  }
  
 -int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
 -                         __u64 ibits)
 +static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
  {
        struct inode *inode = dentry->d_inode;
        int rc;
  
 -      rc = __ll_inode_revalidate_it(dentry, it, ibits);
 +      rc = __ll_inode_revalidate(dentry, ibits);
        if (rc != 0)
                return rc;
  
        return rc;
  }
  
 -int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
 -                struct lookup_intent *it, struct kstat *stat)
 +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
  {
        struct inode *inode = de->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_inode_info *lli = ll_i2info(inode);
        int res = 0;
  
 -      res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
 -                                           MDS_INODELOCK_LOOKUP);
 +      res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
 +                                    MDS_INODELOCK_LOOKUP);
        ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
  
        if (res)
  
        return 0;
  }
 -int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
 -{
 -      struct lookup_intent it = { .it_op = IT_GETATTR };
 -
 -      return ll_getattr_it(mnt, de, &it, stat);
 -}
  
 -int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 -              __u64 start, __u64 len)
 +static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 +                   __u64 start, __u64 len)
  {
        int rc;
        size_t num_bytes;
        fiemap->fm_extent_count = fieinfo->fi_extents_max;
        fiemap->fm_start = start;
        fiemap->fm_length = len;
 -      memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
 -             sizeof(struct ll_fiemap_extent));
 +      if (extent_count > 0)
 +              memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
 +                     sizeof(struct ll_fiemap_extent));
  
        rc = ll_do_fiemap(inode, fiemap, num_bytes);
  
        fieinfo->fi_flags = fiemap->fm_flags;
        fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
 -      memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
 -             fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
 +      if (extent_count > 0)
 +              memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
 +                     fiemap->fm_mapped_extents *
 +                     sizeof(struct ll_fiemap_extent));
  
        OBD_FREE_LARGE(fiemap, num_bytes);
        return rc;
  }
  
 -struct posix_acl * ll_get_acl(struct inode *inode, int type)
 +struct posix_acl *ll_get_acl(struct inode *inode, int type)
  {
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *acl = NULL;
@@@ -3123,8 -3043,10 +3049,8 @@@ int ll_inode_permission(struct inode *i
        * need to do it before permission check. */
  
        if (inode == inode->i_sb->s_root->d_inode) {
 -              struct lookup_intent it = { .it_op = IT_LOOKUP };
 -
 -              rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
 -                                            MDS_INODELOCK_LOOKUP);
 +              rc = __ll_inode_revalidate(inode->i_sb->s_root,
 +                                         MDS_INODELOCK_LOOKUP);
                if (rc)
                        return rc;
        }
  
  /* -o localflock - only provides locally consistent flock locks */
  struct file_operations ll_file_operations = {
-       .read      = ll_file_read,
-       .aio_read = ll_file_aio_read,
-       .write    = ll_file_write,
-       .aio_write = ll_file_aio_write,
+       .read      = new_sync_read,
+       .read_iter = ll_file_read_iter,
+       .write    = new_sync_write,
+       .write_iter = ll_file_write_iter,
        .unlocked_ioctl = ll_file_ioctl,
        .open      = ll_file_open,
        .release        = ll_file_release,
  };
  
  struct file_operations ll_file_operations_flock = {
-       .read      = ll_file_read,
-       .aio_read    = ll_file_aio_read,
-       .write    = ll_file_write,
-       .aio_write   = ll_file_aio_write,
+       .read      = new_sync_read,
+       .read_iter    = ll_file_read_iter,
+       .write    = new_sync_write,
+       .write_iter   = ll_file_write_iter,
        .unlocked_ioctl = ll_file_ioctl,
        .open      = ll_file_open,
        .release        = ll_file_release,
  
  /* These are for -o noflock - to return ENOSYS on flock calls */
  struct file_operations ll_file_operations_noflock = {
-       .read      = ll_file_read,
-       .aio_read    = ll_file_aio_read,
-       .write    = ll_file_write,
-       .aio_write   = ll_file_aio_write,
+       .read      = new_sync_read,
+       .read_iter    = ll_file_read_iter,
+       .write    = new_sync_write,
+       .write_iter   = ll_file_write_iter,
        .unlocked_ioctl = ll_file_ioctl,
        .open      = ll_file_open,
        .release        = ll_file_release,
@@@ -3276,9 -3198,8 +3202,9 @@@ void ll_iocontrol_unregister(void *magi
  EXPORT_SYMBOL(ll_iocontrol_register);
  EXPORT_SYMBOL(ll_iocontrol_unregister);
  
 -enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
 -                      unsigned int cmd, unsigned long arg, int *rcp)
 +static enum llioc_iter
 +ll_iocontrol_call(struct inode *inode, struct file *file,
 +                unsigned int cmd, unsigned long arg, int *rcp)
  {
        enum llioc_iter ret = LLIOC_CONT;
        struct llioc_data *data;
@@@ -3363,7 -3284,7 +3289,7 @@@ static int ll_layout_fetch(struct inod
         * layout here. Please note that we can't use the LVB buffer in
         * completion AST because it doesn't have a large enough buffer */
        oc = ll_mdscapa_get(inode);
 -      rc = ll_get_max_mdsize(sbi, &lmmsize);
 +      rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc == 0)
                rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
                                OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
                return rc;
  
        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 -      if (body == NULL || body->eadatasize > lmmsize)
 +      if (body == NULL)
                GOTO(out, rc = -EPROTO);
  
        lmmsize = body->eadatasize;
@@@ -3440,7 -3361,7 +3366,7 @@@ static int ll_layout_lock_set(struct lu
                if (lvb_ready) {
                        /* layout_gen must be valid if layout lock is not
                         * cancelled and stripe has already set */
 -                      *gen = lli->lli_layout_gen;
 +                      *gen = ll_layout_version_get(lli);
                        rc = 0;
                }
                GOTO(out, rc);
@@@ -3538,20 -3459,32 +3464,20 @@@ int ll_layout_refresh(struct inode *ino
        };
        int rc;
  
 -      *gen = lli->lli_layout_gen;
 -      if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
 +      *gen = ll_layout_version_get(lli);
 +      if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
                return 0;
  
        /* sanity checks */
        LASSERT(fid_is_sane(ll_inode2fid(inode)));
        LASSERT(S_ISREG(inode->i_mode));
  
 -      /* mostly layout lock is caching on the local side, so try to match
 -       * it before grabbing layout lock mutex. */
 -      mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
 -                             LCK_CR | LCK_CW | LCK_PR | LCK_PW);
 -      if (mode != 0) { /* hit cached lock */
 -              rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
 -              if (rc == 0)
 -                      return 0;
 -
 -              /* better hold lli_layout_mutex to try again otherwise
 -               * it will have starvation problem. */
 -      }
 -
        /* take layout lock mutex to enqueue layout lock exclusively. */
        mutex_lock(&lli->lli_layout_mutex);
  
  again:
 -      /* try again. Maybe somebody else has done this. */
 +      /* mostly layout lock is caching on the local side, so try to match
 +       * it before grabbing layout lock mutex. */
        mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
                               LCK_CR | LCK_CW | LCK_PR | LCK_PW);
        if (mode != 0) { /* hit cached lock */
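
The ll_file_read()/ll_file_write() wrappers deleted above, which built a
one-segment iovec and a sync kiocb by hand, are exactly what
new_sync_read()/new_sync_write() now provide generically for every
converted filesystem. A simplified sketch of the read side, modeled on
this era's fs/read_write.c (error handling trimmed):

	static ssize_t new_sync_read(struct file *filp, char __user *buf,
				     size_t len, loff_t *ppos)
	{
		struct iovec iov = { .iov_base = buf, .iov_len = len };
		struct kiocb kiocb;
		struct iov_iter iter;
		ssize_t ret;

		init_sync_kiocb(&kiocb, filp);
		kiocb.ki_pos = *ppos;
		kiocb.ki_nbytes = len;
		iov_iter_init(&iter, READ, &iov, 1, len);

		ret = filp->f_op->read_iter(&kiocb, &iter);
		if (ret == -EIOCBQUEUED)
			ret = wait_on_sync_kiocb(&kiocb);
		*ppos = kiocb.ki_pos;
		return ret;
	}
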
diff --cc drivers/staging/lustre/lustre/llite/llite_internal.h
index dde7632ba01fa8dab671fb43995723bba1f5d3dd,fbb8650ead346885c4f0dd14b5e3bdc7ead493b7..140ee947ba4949ea547ac03ebaf9a9efb9e51ab1
  #define LUSTRE_FPRIVATE(file) ((file)->private_data)
  
  struct ll_dentry_data {
 -      int                             lld_cwd_count;
 -      int                             lld_mnt_count;
 -      struct obd_client_handle        lld_cwd_och;
 -      struct obd_client_handle        lld_mnt_och;
        struct lookup_intent            *lld_it;
        unsigned int                    lld_sa_generation;
        unsigned int                    lld_invalid:1;
@@@ -79,6 -83,8 +79,6 @@@
  
  #define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata))
  
 -extern struct file_operations ll_pgcache_seq_fops;
 -
  #define LLI_INODE_MAGIC                0x111d0de5
  #define LLI_INODE_DEAD                  0xdeadd00d
  
@@@ -114,12 -120,16 +114,12 @@@ enum lli_flags 
        /* Sizeon-on-MDS attributes are changed. An attribute update needs to
         * be sent to MDS. */
        LLIF_SOM_DIRTY    = (1 << 3),
 -      /* File is contented */
 -      LLIF_CONTENDED    = (1 << 4),
 -      /* Truncate uses server lock for this file */
 -      LLIF_SRVLOCK        = (1 << 5),
        /* File data is modified. */
 -      LLIF_DATA_MODIFIED      = (1 << 6),
 +      LLIF_DATA_MODIFIED      = (1 << 4),
        /* File is being restored */
 -      LLIF_FILE_RESTORING     = (1 << 7),
 +      LLIF_FILE_RESTORING     = (1 << 5),
        /* Xattr cache is attached to the file */
 -      LLIF_XATTR_CACHE        = (1 << 8),
 +      LLIF_XATTR_CACHE        = (1 << 6),
  };
  
  struct ll_inode_info {
                         * cleanup the dir readahead. */
                        void                       *d_opendir_key;
                        struct ll_statahead_info       *d_sai;
 -                      struct posix_acl               *d_def_acl;
                        /* protect statahead stuff. */
                        spinlock_t                      d_sa_lock;
                        /* "opendir_pid" is the token when lookup/revalid
  #define lli_readdir_mutex       u.d.d_readdir_mutex
  #define lli_opendir_key        u.d.d_opendir_key
  #define lli_sai                u.d.d_sai
 -#define lli_def_acl        u.d.d_def_acl
  #define lli_sa_lock        u.d.d_sa_lock
  #define lli_opendir_pid        u.d.d_opendir_pid
  
                /* for non-directory */
                struct {
 -                      struct semaphore                f_size_sem;
 -                      void                            *f_size_sem_owner;
 +                      struct mutex                    f_size_mutex;
                        char                            *f_symlink_name;
                        __u64                           f_maxbytes;
                        /*
                        /* for writepage() only to communicate to fsync */
                        int                             f_async_rc;
  
 -                      /* volatile file criteria is based on file name, this
 -                       * flag is used to keep the test result, so the strcmp
 -                       * is done only once
 -                       */
 -                      bool                            f_volatile;
                        /*
                         * whenever a process try to read/write the file, the
                         * jobid of the process will be saved here, and it'll
                        char                 f_jobid[JOBSTATS_JOBID_SIZE];
                } f;
  
 -#define lli_size_sem      u.f.f_size_sem
 -#define lli_size_sem_owner      u.f.f_size_sem_owner
 +#define lli_size_mutex          u.f.f_size_mutex
  #define lli_symlink_name      u.f.f_symlink_name
  #define lli_maxbytes      u.f.f_maxbytes
  #define lli_trunc_sem    u.f.f_trunc_sem
  #define lli_agl_index         u.f.f_agl_index
  #define lli_async_rc          u.f.f_async_rc
  #define lli_jobid             u.f.f_jobid
 -#define lli_volatile          u.f.f_volatile
  
        } u;
  
  
        /* mutex to request for layout lock exclusively. */
        struct mutex                    lli_layout_mutex;
 -      /* valid only inside LAYOUT ibits lock, protected by lli_layout_mutex */
 +      /* Layout version, protected by lli_layout_lock */
        __u32                           lli_layout_gen;
 +      spinlock_t                      lli_layout_lock;
  
        struct rw_semaphore             lli_xattrs_list_rwsem;
        struct mutex                    lli_xattrs_enq_lock;
        struct list_head                lli_xattrs;/* ll_xattr_entry->xe_list */
  };
  
 +static inline __u32 ll_layout_version_get(struct ll_inode_info *lli)
 +{
 +      __u32 gen;
 +
 +      spin_lock(&lli->lli_layout_lock);
 +      gen = lli->lli_layout_gen;
 +      spin_unlock(&lli->lli_layout_lock);
 +
 +      return gen;
 +}
 +
 +static inline void ll_layout_version_set(struct ll_inode_info *lli, __u32 gen)
 +{
 +      spin_lock(&lli->lli_layout_lock);
 +      lli->lli_layout_gen = gen;
 +      spin_unlock(&lli->lli_layout_lock);
 +}
 +
  int ll_xattr_cache_destroy(struct inode *inode);
  
  int ll_xattr_cache_get(struct inode *inode,
   * Locking to guarantee consistency of non-atomic updates to long long i_size,
   * consistency between file size and KMS.
   *
 - * Implemented by ->lli_size_sem and ->lsm_lock, nested in that order.
 + * Implemented by ->lli_size_mutex and ->lsm_lock, nested in that order.
   */
  
  void ll_inode_size_lock(struct inode *inode);
@@@ -441,6 -442,10 +441,6 @@@ enum stats_track_type 
        "xattr",        \
  }
  
 -/* default value for ll_sb_info->contention_time */
 -#define SBI_DEFAULT_CONTENTION_SECONDS     60
 -/* default value for lockless_truncate_enable */
 -#define SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE 1
  #define RCE_HASHES      32
  
  struct rmtacl_ctl_entry {
@@@ -651,6 -656,12 +651,6 @@@ static inline struct inode *ll_info2i(s
        return &lli->lli_vfs_inode;
  }
  
 -struct it_cb_data {
 -      struct inode  *icbd_parent;
 -      struct dentry **icbd_childp;
 -      obd_id  hash;
 -};
 -
  __u32 ll_i2suppgid(struct inode *i);
  void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2);
  
@@@ -658,13 -669,21 +658,13 @@@ static inline int ll_need_32bit_api(str
  {
  #if BITS_PER_LONG == 32
        return 1;
 +#elif defined(CONFIG_COMPAT)
 +      return unlikely(is_compat_task() || (sbi->ll_flags & LL_SBI_32BIT_API));
  #else
 -      return unlikely(
 -#ifdef CONFIG_COMPAT
 -              is_compat_task() ||
 -#endif
 -              (sbi->ll_flags & LL_SBI_32BIT_API)
 -      );
 +      return unlikely(sbi->ll_flags & LL_SBI_32BIT_API);
  #endif
  }
  
 -#define LLAP_MAGIC 98764321
 -
 -extern struct kmem_cache *ll_async_page_slab;
 -extern size_t ll_async_page_slab_size;
 -
  void ll_ra_read_in(struct file *f, struct ll_ra_read *rar);
  void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
  struct ll_ra_read *ll_ra_read_get(struct file *f);
@@@ -697,16 -716,14 +697,16 @@@ static inline void ll_rw_stats_tally(st
  
  /* llite/dir.c */
  void ll_release_page(struct page *page, int remove);
 -extern struct file_operations ll_dir_operations;
 -extern struct inode_operations ll_dir_inode_operations;
 +extern const struct file_operations ll_dir_operations;
 +extern const struct inode_operations ll_dir_inode_operations;
  struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
                             struct ll_dir_chain *chain);
  int ll_dir_read(struct inode *inode, struct dir_context *ctx);
  
  int ll_get_mdt_idx(struct inode *inode);
  /* llite/namei.c */
 +extern const struct inode_operations ll_special_inode_operations;
 +
  int ll_objects_destroy(struct ptlrpc_request *request,
                       struct inode *dir);
  struct inode *ll_iget(struct super_block *sb, ino_t hash,
@@@ -721,34 -738,43 +721,34 @@@ int ll_prepare_write(struct file *, str
  int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to);
  int ll_writepage(struct page *page, struct writeback_control *wbc);
  int ll_writepages(struct address_space *, struct writeback_control *wbc);
 -void ll_removepage(struct page *page);
  int ll_readpage(struct file *file, struct page *page);
  void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
 -int ll_file_punch(struct inode *, loff_t, int);
 -ssize_t ll_file_lockless_io(struct file *, char *, size_t, loff_t *, int);
 -void ll_clear_file_contended(struct inode*);
 -int ll_sync_page_range(struct inode *, struct address_space *, loff_t, size_t);
  int ll_readahead(const struct lu_env *env, struct cl_io *io,
                 struct ll_readahead_state *ras, struct address_space *mapping,
                 struct cl_page_list *queue, int flags);
  
 +#ifndef MS_HAS_NEW_AOPS
 +extern const struct address_space_operations ll_aops;
 +#else
 +extern const struct address_space_operations_ext ll_aops;
 +#endif
 +
  /* llite/file.c */
  extern struct file_operations ll_file_operations;
  extern struct file_operations ll_file_operations_flock;
  extern struct file_operations ll_file_operations_noflock;
  extern struct inode_operations ll_file_inode_operations;
 -extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
 -                                __u64);
  extern int ll_have_md_lock(struct inode *inode, __u64 *bits,
                           ldlm_mode_t l_req_mode);
  extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
                                   struct lustre_handle *lockh, __u64 flags,
                                   ldlm_mode_t mode);
 -int __ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
 -                           __u64 bits);
 -int ll_revalidate_nd(struct dentry *dentry, unsigned int flags);
  int ll_file_open(struct inode *inode, struct file *file);
  int ll_file_release(struct inode *inode, struct file *file);
  int ll_glimpse_ioctl(struct ll_sb_info *sbi,
                     struct lov_stripe_md *lsm, lstat_t *st);
  void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch);
 -int ll_local_open(struct file *file,
 -                struct lookup_intent *it, struct ll_file_data *fd,
 -                struct obd_client_handle *och);
  int ll_release_openhandle(struct dentry *, struct lookup_intent *);
 -int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 -              struct file *file);
  int ll_md_real_close(struct inode *inode, fmode_t fmode);
  void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
                      struct obd_client_handle **och, unsigned long flags);
@@@ -756,10 -782,15 +756,10 @@@ void ll_done_writing_attr(struct inode 
  int ll_som_update(struct inode *inode, struct md_op_data *op_data);
  int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
                     __u64 ioepoch, int sync);
 -int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
 -                struct md_open_data **mod);
  void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
                          struct lustre_handle *fh);
 -int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
 -             struct lookup_intent *it, struct kstat *stat);
  int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
 -struct ll_file_data *ll_file_data_get(void);
 -struct posix_acl * ll_get_acl(struct inode *inode, int type);
 +struct posix_acl *ll_get_acl(struct inode *inode, int type);
  
  int ll_inode_permission(struct inode *inode, int mask);
  
@@@ -774,30 -805,44 +774,30 @@@ int ll_dir_setstripe(struct inode *inod
  int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
                     int *lmm_size, struct ptlrpc_request **request);
  int ll_fsync(struct file *file, loff_t start, loff_t end, int data);
 -int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
 -            int num_bytes);
  int ll_merge_lvb(const struct lu_env *env, struct inode *inode);
 -int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg);
 -int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  int ll_fid2path(struct inode *inode, void *arg);
  int ll_data_version(struct inode *inode, __u64 *data_version, int extent_lock);
  int ll_hsm_release(struct inode *inode);
  
 -struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
 -                                      fmode_t mode, __u64 flags);
 -int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 -                 bool *lease_broken);
 -
  /* llite/dcache.c */
  
  int ll_d_init(struct dentry *de);
 -extern struct dentry_operations ll_d_ops;
 +extern const struct dentry_operations ll_d_ops;
  void ll_intent_drop_lock(struct lookup_intent *);
  void ll_intent_release(struct lookup_intent *);
  void ll_invalidate_aliases(struct inode *);
 -void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft);
  void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry);
 -int ll_dcompare(const struct dentry *parent, const struct dentry *dentry,
 -              unsigned int len, const char *str, const struct qstr *d_name);
  int ll_revalidate_it_finish(struct ptlrpc_request *request,
                            struct lookup_intent *it, struct dentry *de);
  
  /* llite/llite_lib.c */
  extern struct super_operations lustre_super_operations;
  
 -char *ll_read_opt(const char *opt, char *data);
  void ll_lli_init(struct ll_inode_info *lli);
  int ll_fill_super(struct super_block *sb, struct vfsmount *mnt);
  void ll_put_super(struct super_block *sb);
  void ll_kill_super(struct super_block *sb);
  struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock);
 -struct inode *ll_inode_from_lock(struct ldlm_lock *lock);
  void ll_clear_inode(struct inode *inode);
  int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import);
  int ll_setattr(struct dentry *de, struct iattr *attr);
@@@ -817,11 -862,9 +817,11 @@@ void ll_dirty_page_discard_warn(struct 
  int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
                  struct super_block *, struct lookup_intent *);
  void lustre_dump_dentry(struct dentry *, int recur);
 -void lustre_dump_inode(struct inode *);
  int ll_obd_statfs(struct inode *inode, void *arg);
  int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
 +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize);
 +int ll_get_max_cookiesize(struct ll_sb_info *sbi, int *max_cookiesize);
 +int ll_get_default_cookiesize(struct ll_sb_info *sbi, int *default_cookiesize);
  int ll_process_config(struct lustre_cfg *lcfg);
  struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
                                      struct inode *i1, struct inode *i2,
@@@ -838,6 -881,15 +838,6 @@@ void get_uuid2fsid(const char *name, in
  struct inode *search_inode_for_lustre(struct super_block *sb,
                                      const struct lu_fid *fid);
  
 -/* llite/special.c */
 -extern struct inode_operations ll_special_inode_operations;
 -extern struct file_operations ll_special_chr_inode_fops;
 -extern struct file_operations ll_special_chr_file_fops;
 -extern struct file_operations ll_special_blk_inode_fops;
 -extern struct file_operations ll_special_fifo_inode_fops;
 -extern struct file_operations ll_special_fifo_file_fops;
 -extern struct file_operations ll_special_sock_inode_fops;
 -
  /* llite/symlink.c */
  extern struct inode_operations ll_fast_symlink_inode_operations;
  
@@@ -905,6 -957,11 +905,6 @@@ struct vvp_io 
         * Set when cui_bead has been initialized.
         */
        int               cui_ra_window_set;
 -      /**
 -       * Partially truncated page, that vvp_io_trunc_start() keeps locked
 -       * across truncate.
 -       */
 -      struct cl_page      *cui_partpage;
  };
  
  /**
@@@ -917,8 -974,7 +917,7 @@@ struct vvp_io_args 
        union {
                struct {
                        struct kiocb      *via_iocb;
-                       struct iovec      *via_iov;
-                       unsigned long      via_nrsegs;
+                       struct iov_iter   *via_iter;
                } normal;
                struct {
                        struct pipe_inode_info  *via_pipe;
@@@ -933,9 -989,12 +932,9 @@@ struct ll_cl_context 
        struct cl_page *lcc_page;
        struct lu_env  *lcc_env;
        int          lcc_refcheck;
 -      int          lcc_created;
  };
  
  struct vvp_thread_info {
 -      struct ost_lvb       vti_lvb;
 -      struct cl_2queue     vti_queue;
        struct iovec     vti_local_iov;
        struct vvp_io_args   vti_args;
        struct ra_io_arg     vti_ria;
@@@ -982,17 -1041,25 +981,17 @@@ static inline struct vvp_io *vvp_env_io
        return &vvp_env_session(env)->vs_ios;
  }
  
 +int vvp_global_init(void);
 +void vvp_global_fini(void);
 +
  void ll_queue_done_writing(struct inode *inode, unsigned long flags);
  void ll_close_thread_shutdown(struct ll_close_queue *lcq);
  int ll_close_thread_start(struct ll_close_queue **lcq_ret);
  
  /* llite/llite_mmap.c */
 -typedef struct rb_root  rb_root_t;
 -typedef struct rb_node  rb_node_t;
 -
 -struct ll_lock_tree_node;
 -struct ll_lock_tree {
 -      rb_root_t                      lt_root;
 -      struct list_head                      lt_locked_list;
 -      struct ll_file_data         *lt_fd;
 -};
  
  int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last);
  int ll_file_mmap(struct file * file, struct vm_area_struct * vma);
 -struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
 -                                            __u64 end, ldlm_mode_t mode);
  void policy_from_vma(ldlm_policy_data_t *policy,
                struct vm_area_struct *vma, unsigned long addr, size_t count);
  struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
@@@ -1059,6 -1126,11 +1058,6 @@@ static inline struct lu_fid *ll_inode2f
        return fid;
  }
  
 -static inline int ll_mds_max_easize(struct super_block *sb)
 -{
 -      return sbi2mdc(ll_s2sbi(sb))->cl_max_mds_easize;
 -}
 -
  static inline __u64 ll_file_maxbytes(struct inode *inode)
  {
        return ll_i2info(inode)->lli_maxbytes;
@@@ -1076,6 -1148,7 +1075,6 @@@ int ll_removexattr(struct dentry *dentr
  extern struct kmem_cache *ll_remote_perm_cachep;
  extern struct kmem_cache *ll_rmtperm_hash_cachep;
  
 -struct hlist_head *alloc_rmtperm_hash(void);
  void free_rmtperm_hash(struct hlist_head *hash);
  int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm);
  int lustre_check_remote_perm(struct inode *inode, int mask);
@@@ -1088,6 -1161,7 +1087,6 @@@ void ll_capa_thread_stop(void)
  void ll_capa_timer_callback(unsigned long unused);
  
  struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa);
 -int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa);
  
  void ll_capa_open(struct inode *inode);
  void ll_capa_close(struct inode *inode);
@@@ -1107,12 -1181,14 +1106,12 @@@ extern struct lu_device_type vvp_device
   */
  int cl_sb_init(struct super_block *sb);
  int cl_sb_fini(struct super_block *sb);
 -enum cl_lock_mode  vvp_mode_from_vma(struct vm_area_struct *vma);
  void ll_io_init(struct cl_io *io, const struct file *file, int write);
  
  void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                struct ll_readahead_state *ras, unsigned long index,
                unsigned hit);
  void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len);
 -int ll_is_file_contended(struct file *file);
  void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which);
  
  /* llite/llite_rmtacl.c */
@@@ -1185,6 -1261,7 +1184,6 @@@ struct ll_statahead_info 
        unsigned int        sai_skip_hidden;/* skipped hidden dentry count */
        unsigned int        sai_ls_all:1,   /* "ls -al", do stat-ahead for
                                                 * hidden entries */
 -                              sai_in_readpage:1,/* statahead is in readdir()*/
                                sai_agl_valid:1;/* AGL is valid for the dir */
        wait_queue_head_t            sai_waitq;      /* stat-ahead wait queue */
        struct ptlrpc_thread    sai_thread;     /* stat-ahead thread */
@@@ -1309,6 -1386,9 +1308,6 @@@ typedef enum llioc_iter (*llioc_callbac
                struct file *file, unsigned int cmd, unsigned long arg,
                void *magic, int *rcp);
  
 -enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
 -              unsigned int cmd, unsigned long arg, int *rcp);
 -
  /* export functions */
  /* Register ioctl block dynamatically for a regular file.
   *
@@@ -1350,7 -1430,7 +1349,7 @@@ static inline void cl_isize_unlock(stru
  
  static inline void cl_isize_write_nolock(struct inode *inode, loff_t kms)
  {
 -      LASSERT(down_trylock(&ll_i2info(inode)->lli_size_sem) != 0);
 +      LASSERT(mutex_is_locked(&ll_i2info(inode)->lli_size_mutex));
        i_size_write(inode, kms);
  }
  
diff --cc drivers/staging/lustre/lustre/llite/rw.c
index f0122c568a099fbb2ea519ddc4bca5d97a34495a,b345dfa599f3d864145f34e2d57426cf4fd75bdd..56162103cc79c2dad9038abb11d715c6e9909f45
@@@ -77,6 -77,12 +77,6 @@@ static void ll_cl_fini(struct ll_cl_con
                cl_page_put(env, page);
        }
  
 -      if (io && lcc->lcc_created) {
 -              cl_io_end(env, io);
 -              cl_io_unlock(env, io);
 -              cl_io_iter_fini(env, io);
 -              cl_io_fini(env, io);
 -      }
        cl_env_put(env, &lcc->lcc_refcheck);
  }
  
@@@ -151,8 -157,7 +151,7 @@@ static struct ll_cl_context *ll_cl_init
                result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE);
                if (result == 0) {
                        cio->cui_fd = LUSTRE_FPRIVATE(file);
-                       cio->cui_iov = NULL;
-                       cio->cui_nrsegs = 0;
+                       cio->cui_iter = NULL;
                        result = cl_io_iter_init(env, io);
                        if (result == 0) {
                                result = cl_io_lock(env, io);
                        }
                } else
                        result = io->ci_result;
 -              lcc->lcc_created = 1;
        }
  
        lcc->lcc_io = io;
diff --cc drivers/staging/lustre/lustre/llite/rw26.c
index 55ca8d3c3e46451b654acdc298333035fcb11e4b,6b5994577b6b9f61bbb704341c72f0fa088183b7..af84c1aaa5f83f6c994a64da4b164535cbe54ac0
@@@ -218,14 -218,11 +218,11 @@@ static void ll_free_user_pages(struct p
        int i;
  
        for (i = 0; i < npages; i++) {
-               if (pages[i] == NULL)
-                       break;
                if (do_dirty)
                        set_page_dirty_lock(pages[i]);
                page_cache_release(pages[i]);
        }
-       OBD_FREE_LARGE(pages, npages * sizeof(*pages));
+       kvfree(pages);
  }
  
  ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
@@@ -363,18 -360,16 +360,16 @@@ static ssize_t ll_direct_IO_26_seg(cons
  #define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) & \
                      ~(DT_MAX_BRW_SIZE - 1))
  static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
-                              const struct iovec *iov, loff_t file_offset,
-                              unsigned long nr_segs)
+                              struct iov_iter *iter, loff_t file_offset)
  {
        struct lu_env *env;
        struct cl_io *io;
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        struct ccc_object *obj = cl_inode2ccc(inode);
-       long count = iov_length(iov, nr_segs);
-       long tot_bytes = 0, result = 0;
+       ssize_t count = iov_iter_count(iter);
+       ssize_t tot_bytes = 0, result = 0;
        struct ll_inode_info *lli = ll_i2info(inode);
-       unsigned long seg = 0;
        long size = MAX_DIO_SIZE;
        int refcheck;
  
        if ((file_offset & ~CFS_PAGE_MASK) || (count & ~CFS_PAGE_MASK))
                return -EINVAL;
  
 -      CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), size=%lu (max %lu), "
 -             "offset=%lld=%llx, pages %lu (max %lu)\n",
 +      CDEBUG(D_VFSTRACE,
 +             "VFS Op:inode=%lu/%u(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)\n",
               inode->i_ino, inode->i_generation, inode, count, MAX_DIO_SIZE,
               file_offset, file_offset, count >> PAGE_CACHE_SHIFT,
               MAX_DIO_SIZE >> PAGE_CACHE_SHIFT);
  
        /* Check that all user buffers are aligned as well */
-       for (seg = 0; seg < nr_segs; seg++) {
-               if (((unsigned long)iov[seg].iov_base & ~CFS_PAGE_MASK) ||
-                   (iov[seg].iov_len & ~CFS_PAGE_MASK))
-                       return -EINVAL;
-       }
+       if (iov_iter_alignment(iter) & ~CFS_PAGE_MASK)
+               return -EINVAL;
  
        env = cl_env_get(&refcheck);
        LASSERT(!IS_ERR(env));
                mutex_lock(&inode->i_mutex);
  
        LASSERT(obj->cob_transient_pages == 0);
-       for (seg = 0; seg < nr_segs; seg++) {
-               long iov_left = iov[seg].iov_len;
-               unsigned long user_addr = (unsigned long)iov[seg].iov_base;
+       while (iov_iter_count(iter)) {
+               struct page **pages;
+               size_t offs;
  
+               count = min_t(size_t, iov_iter_count(iter), size);
                if (rw == READ) {
                        if (file_offset >= i_size_read(inode))
                                break;
-                       if (file_offset + iov_left > i_size_read(inode))
-                               iov_left = i_size_read(inode) - file_offset;
+                       if (file_offset + count > i_size_read(inode))
+                               count = i_size_read(inode) - file_offset;
                }
  
-               while (iov_left > 0) {
-                       struct page **pages;
-                       int page_count, max_pages = 0;
-                       long bytes;
-                       bytes = min(size, iov_left);
-                       page_count = ll_get_user_pages(rw, user_addr, bytes,
-                                                      &pages, &max_pages);
-                       if (likely(page_count > 0)) {
-                               if (unlikely(page_count <  max_pages))
-                                       bytes = page_count << PAGE_CACHE_SHIFT;
-                               result = ll_direct_IO_26_seg(env, io, rw, inode,
-                                                            file->f_mapping,
-                                                            bytes, file_offset,
-                                                            pages, page_count);
-                               ll_free_user_pages(pages, max_pages, rw==READ);
-                       } else if (page_count == 0) {
-                               GOTO(out, result = -EFAULT);
-                       } else {
-                               result = page_count;
-                       }
-                       if (unlikely(result <= 0)) {
-                               /* If we can't allocate a large enough buffer
-                                * for the request, shrink it to a smaller
-                                * PAGE_SIZE multiple and try again.
-                                * We should always be able to kmalloc for a
-                                * page worth of page pointers = 4MB on i386. */
-                               if (result == -ENOMEM &&
-                                   size > (PAGE_CACHE_SIZE / sizeof(*pages)) *
-                                          PAGE_CACHE_SIZE) {
-                                       size = ((((size / 2) - 1) |
-                                                ~CFS_PAGE_MASK) + 1) &
-                                               CFS_PAGE_MASK;
-                                       CDEBUG(D_VFSTRACE,"DIO size now %lu\n",
-                                              size);
-                                       continue;
-                               }
-                               GOTO(out, result);
+               result = iov_iter_get_pages_alloc(iter, &pages, count, &offs);
+               if (likely(result > 0)) {
+                       int n = (result + offs + PAGE_SIZE - 1) / PAGE_SIZE;
+                       result = ll_direct_IO_26_seg(env, io, rw, inode,
+                                                    file->f_mapping,
+                                                    result, file_offset,
+                                                    pages, n);
+                       ll_free_user_pages(pages, n, rw == READ);
+               }
+               if (unlikely(result <= 0)) {
+                       /* If we can't allocate a large enough buffer
+                        * for the request, shrink it to a smaller
+                        * PAGE_SIZE multiple and try again.
+                        * We should always be able to kmalloc for a
+                        * page worth of page pointers = 4MB on i386. */
+                       if (result == -ENOMEM &&
+                           size > (PAGE_CACHE_SIZE / sizeof(*pages)) *
+                                  PAGE_CACHE_SIZE) {
+                               size = ((((size / 2) - 1) |
+                                        ~CFS_PAGE_MASK) + 1) &
+                                       CFS_PAGE_MASK;
+                               CDEBUG(D_VFSTRACE, "DIO size now %lu\n",
+                                      size);
+                               continue;
                        }
  
-                       tot_bytes += result;
-                       file_offset += result;
-                       iov_left -= result;
-                       user_addr += result;
+                       GOTO(out, result);
                }
+               iov_iter_advance(iter, result);
+               tot_bytes += result;
+               file_offset += result;
        }
  out:
        LASSERT(obj->cob_transient_pages == 0);
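
The rewritten loop above is lustre's adoption of the generic iov_iter pinning pattern added by this series. A minimal sketch of that pattern (kernel context; do_segment() is a hypothetical stand-in for ll_direct_IO_26_seg(), and error handling is simplified):

#include <linux/uio.h>
#include <linux/pagemap.h>

static ssize_t dio_walk(struct iov_iter *iter, loff_t pos,
                        ssize_t (*do_segment)(struct page **pages, int npages,
                                              size_t bytes, loff_t pos))
{
        ssize_t tot = 0;

        while (iov_iter_count(iter)) {
                struct page **pages;
                size_t offs;
                ssize_t got;
                int npages;

                /* pin user pages; the helper allocates the page array */
                got = iov_iter_get_pages_alloc(iter, &pages,
                                               iov_iter_count(iter), &offs);
                if (got <= 0)
                        return tot ? tot : got;

                npages = DIV_ROUND_UP(got + offs, PAGE_SIZE);
                /* do_segment() is expected to release the pinned pages */
                got = do_segment(pages, npages, got, pos);
                if (got <= 0)
                        return tot ? tot : got;

                iov_iter_advance(iter, got);    /* consume transferred bytes */
                tot += got;
                pos += got;
        }
        return tot;
}
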
@@@ -529,9 -507,9 +507,9 @@@ static int ll_write_end(struct file *fi
  }
  
  #ifdef CONFIG_MIGRATION
 -int ll_migratepage(struct address_space *mapping,
 -              struct page *newpage, struct page *page
 -              , enum migrate_mode mode
 +static int ll_migratepage(struct address_space *mapping,
 +                       struct page *newpage, struct page *page,
 +                       enum migrate_mode mode
                )
  {
        /* Always fail page migration until we have a proper implementation */
  #endif
  
  #ifndef MS_HAS_NEW_AOPS
 -struct address_space_operations ll_aops = {
 -      .readpage       = ll_readpage,
 -//    .readpages      = ll_readpages,
 +const struct address_space_operations ll_aops = {
 +      .readpage       = ll_readpage,
        .direct_IO      = ll_direct_IO_26,
        .writepage      = ll_writepage,
        .writepages     = ll_writepages,
  #ifdef CONFIG_MIGRATION
        .migratepage    = ll_migratepage,
  #endif
 -      .bmap      = NULL
  };
  #else
 -struct address_space_operations_ext ll_aops = {
 +const struct address_space_operations_ext ll_aops = {
        .orig_aops.readpage       = ll_readpage,
  //    .orig_aops.readpages      = ll_readpages,
        .orig_aops.direct_IO      = ll_direct_IO_26,
  #ifdef CONFIG_MIGRATION
        .orig_aops.migratepage    = ll_migratepage,
  #endif
 -      .orig_aops.bmap    = NULL,
        .write_begin    = ll_write_begin,
        .write_end      = ll_write_end
  };
index 7dd2b4723c5fd6fdded98fadbcb63a68ce8c80fa,cfe8c625ae6403c72a6fa7513e7cce79fe8c8570..0e0b404cb5e6cc3b33dc8b736675485a617cdd55
@@@ -80,7 -80,7 +80,7 @@@ static bool can_populate_pages(const st
        case CIT_WRITE:
        /* no lock is needed here to check lli_layout_gen, as we hold the
         * extent lock, and the GROUP lock has to be held to swap the layout */
 -              if (lli->lli_layout_gen != cio->cui_layout_gen) {
 +              if (ll_layout_version_get(lli) != cio->cui_layout_gen) {
                        io->ci_need_restart = 1;
                        /* this will return application a short read/write */
                        io->ci_continue = 0;
@@@ -190,7 -190,7 +190,7 @@@ static void vvp_io_fault_fini(const str
        vvp_io_fini(env, ios);
  }
  
 -enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma)
 +static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma)
  {
        /*
         * we only want to hold PW locks if the mmap() can generate
@@@ -211,27 -211,26 +211,26 @@@ static int vvp_mmap_locks(const struct 
        struct cl_lock_descr   *descr = &cti->cti_descr;
        ldlm_policy_data_t      policy;
        unsigned long      addr;
-       unsigned long      seg;
        ssize_t          count;
        int                  result;
+       struct iov_iter i;
+       struct iovec iov;
  
        LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
  
        if (!cl_is_normalio(env, io))
                return 0;
  
-       if (vio->cui_iov == NULL) /* nfs or loop back device write */
+       if (vio->cui_iter == NULL) /* nfs or loop back device write */
                return 0;
  
        /* No MM (e.g. NFS)? No vmas too. */
        if (mm == NULL)
                return 0;
  
-       for (seg = 0; seg < vio->cui_nrsegs; seg++) {
-               const struct iovec *iv = &vio->cui_iov[seg];
-               addr = (unsigned long)iv->iov_base;
-               count = iv->iov_len;
+       iov_for_each(iov, i, *(vio->cui_iter)) {
+               addr = (unsigned long)iov.iov_base;
+               count = iov.iov_len;
                if (count == 0)
                        continue;
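
iov_for_each() above walks the iovec segments of an iov_iter by value on a private copy, which is what lets vvp_mmap_locks() drop the manual cui_nrsegs bookkeeping. A hedged toy illustration of the idiom:

#include <linux/uio.h>

/* Count the non-empty segments of an iovec-backed iter; illustrative
 * only -- the copy 'i' is advanced by the macro, 'from' is untouched. */
static unsigned long count_nonempty_segs(const struct iov_iter *from)
{
        struct iov_iter i;
        struct iovec iov;
        unsigned long n = 0;

        iov_for_each(iov, i, *from)
                if (iov.iov_len != 0)
                        n++;
        return n;
}
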
  
@@@ -527,9 -526,7 +526,7 @@@ static int vvp_io_read_start(const stru
        switch (vio->cui_io_subtype) {
        case IO_NORMAL:
                LASSERT(cio->cui_iocb->ki_pos == pos);
-               result = generic_file_aio_read(cio->cui_iocb,
-                                              cio->cui_iov, cio->cui_nrsegs,
-                                              cio->cui_iocb->ki_pos);
+               result = generic_file_read_iter(cio->cui_iocb, cio->cui_iter);
                break;
        case IO_SPLICE:
                result = generic_file_splice_read(file, &pos,
@@@ -595,12 -592,11 +592,11 @@@ static int vvp_io_write_start(const str
  
        CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt);
  
-       if (cio->cui_iov == NULL) /* from a temp io in ll_cl_init(). */
+       if (cio->cui_iter == NULL) /* from a temp io in ll_cl_init(). */
                result = 0;
        else
-               result = generic_file_aio_write(cio->cui_iocb,
-                                               cio->cui_iov, cio->cui_nrsegs,
-                                               cio->cui_iocb->ki_pos);
+               result = generic_file_write_iter(cio->cui_iocb, cio->cui_iter);
        if (result > 0) {
                if (result < cnt)
                        io->ci_continue = 0;
@@@ -1162,10 -1158,9 +1158,9 @@@ int vvp_io_init(const struct lu_env *en
                 *  results."  -- Single Unix Spec */
                if (count == 0)
                        result = 1;
-               else {
+               else
                        cio->cui_tot_count = count;
-                       cio->cui_tot_nrsegs = 0;
-               }
                /* for read/write, we store the jobid in the inode, and
                 * it'll be fetched by osc when building RPC.
                 *
index ff205a7bc55c9aefcd98994aeb3c25fe4c37eb28,a8898df131ed5b4abdf37adb8784645e88840381..648f9e489b39bb3a291f091771dcd099013176db
@@@ -43,7 -43,7 +43,7 @@@ struct usb_interface_descriptor fsg_int
        .bInterfaceProtocol =   USB_PR_BULK,    /* Adjusted during fsg_bind() */
        .iInterface =           FSG_STRING_INTERFACE,
  };
 -EXPORT_SYMBOL(fsg_intf_desc);
 +EXPORT_SYMBOL_GPL(fsg_intf_desc);
  
  /*
   * Three full-speed endpoint descriptors: bulk-in, bulk-out, and
@@@ -58,7 -58,7 +58,7 @@@ struct usb_endpoint_descriptor fsg_fs_b
        .bmAttributes =         USB_ENDPOINT_XFER_BULK,
        /* wMaxPacketSize set by autoconfiguration */
  };
 -EXPORT_SYMBOL(fsg_fs_bulk_in_desc);
 +EXPORT_SYMBOL_GPL(fsg_fs_bulk_in_desc);
  
  struct usb_endpoint_descriptor fsg_fs_bulk_out_desc = {
        .bLength =              USB_DT_ENDPOINT_SIZE,
@@@ -68,7 -68,7 +68,7 @@@
        .bmAttributes =         USB_ENDPOINT_XFER_BULK,
        /* wMaxPacketSize set by autoconfiguration */
  };
 -EXPORT_SYMBOL(fsg_fs_bulk_out_desc);
 +EXPORT_SYMBOL_GPL(fsg_fs_bulk_out_desc);
  
  struct usb_descriptor_header *fsg_fs_function[] = {
        (struct usb_descriptor_header *) &fsg_intf_desc,
@@@ -76,7 -76,7 +76,7 @@@
        (struct usb_descriptor_header *) &fsg_fs_bulk_out_desc,
        NULL,
  };
 -EXPORT_SYMBOL(fsg_fs_function);
 +EXPORT_SYMBOL_GPL(fsg_fs_function);
  
  
  /*
@@@ -95,7 -95,7 +95,7 @@@ struct usb_endpoint_descriptor fsg_hs_b
        .bmAttributes =         USB_ENDPOINT_XFER_BULK,
        .wMaxPacketSize =       cpu_to_le16(512),
  };
 -EXPORT_SYMBOL(fsg_hs_bulk_in_desc);
 +EXPORT_SYMBOL_GPL(fsg_hs_bulk_in_desc);
  
  struct usb_endpoint_descriptor fsg_hs_bulk_out_desc = {
        .bLength =              USB_DT_ENDPOINT_SIZE,
        .wMaxPacketSize =       cpu_to_le16(512),
        .bInterval =            1,      /* NAK every 1 uframe */
  };
 -EXPORT_SYMBOL(fsg_hs_bulk_out_desc);
 +EXPORT_SYMBOL_GPL(fsg_hs_bulk_out_desc);
  
  
  struct usb_descriptor_header *fsg_hs_function[] = {
        (struct usb_descriptor_header *) &fsg_hs_bulk_out_desc,
        NULL,
  };
 -EXPORT_SYMBOL(fsg_hs_function);
 +EXPORT_SYMBOL_GPL(fsg_hs_function);
  
  struct usb_endpoint_descriptor fsg_ss_bulk_in_desc = {
        .bLength =              USB_DT_ENDPOINT_SIZE,
        .bmAttributes =         USB_ENDPOINT_XFER_BULK,
        .wMaxPacketSize =       cpu_to_le16(1024),
  };
 -EXPORT_SYMBOL(fsg_ss_bulk_in_desc);
 +EXPORT_SYMBOL_GPL(fsg_ss_bulk_in_desc);
  
  struct usb_ss_ep_comp_descriptor fsg_ss_bulk_in_comp_desc = {
        .bLength =              sizeof(fsg_ss_bulk_in_comp_desc),
  
        /*.bMaxBurst =          DYNAMIC, */
  };
 -EXPORT_SYMBOL(fsg_ss_bulk_in_comp_desc);
 +EXPORT_SYMBOL_GPL(fsg_ss_bulk_in_comp_desc);
  
  struct usb_endpoint_descriptor fsg_ss_bulk_out_desc = {
        .bLength =              USB_DT_ENDPOINT_SIZE,
        .bmAttributes =         USB_ENDPOINT_XFER_BULK,
        .wMaxPacketSize =       cpu_to_le16(1024),
  };
 -EXPORT_SYMBOL(fsg_ss_bulk_out_desc);
 +EXPORT_SYMBOL_GPL(fsg_ss_bulk_out_desc);
  
  struct usb_ss_ep_comp_descriptor fsg_ss_bulk_out_comp_desc = {
        .bLength =              sizeof(fsg_ss_bulk_in_comp_desc),
  
        /*.bMaxBurst =          DYNAMIC, */
  };
 -EXPORT_SYMBOL(fsg_ss_bulk_out_comp_desc);
 +EXPORT_SYMBOL_GPL(fsg_ss_bulk_out_comp_desc);
  
  struct usb_descriptor_header *fsg_ss_function[] = {
        (struct usb_descriptor_header *) &fsg_intf_desc,
        (struct usb_descriptor_header *) &fsg_ss_bulk_out_comp_desc,
        NULL,
  };
 -EXPORT_SYMBOL(fsg_ss_function);
 +EXPORT_SYMBOL_GPL(fsg_ss_function);
  
  
   /*-------------------------------------------------------------------------*/
@@@ -179,7 -179,7 +179,7 @@@ void fsg_lun_close(struct fsg_lun *curl
                curlun->filp = NULL;
        }
  }
 -EXPORT_SYMBOL(fsg_lun_close);
 +EXPORT_SYMBOL_GPL(fsg_lun_close);
  
  int fsg_lun_open(struct fsg_lun *curlun, const char *filename)
  {
         * If we can't read the file, it's no good.
         * If we can't write the file, use it read-only.
         */
-       if (!(filp->f_op->read || filp->f_op->aio_read)) {
+       if (!(filp->f_mode & FMODE_CAN_READ)) {
                LINFO(curlun, "file not readable: %s\n", filename);
                goto out;
        }
-       if (!(filp->f_op->write || filp->f_op->aio_write))
+       if (!(filp->f_mode & FMODE_CAN_WRITE))
                ro = 1;
  
        size = i_size_read(inode->i_mapping->host);
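
The f_mode bits tested above are computed once at open time; paraphrased (not verbatim) from this series' do_dentry_open():

#include <linux/fs.h>

/* Paraphrase: set when the file is opened, so later capability checks
 * reduce to cheap f_mode tests like the ones in fsg_lun_open(). */
static void sketch_set_rw_caps(struct file *f)
{
        if ((f->f_mode & FMODE_READ) &&
            (f->f_op->read || f->f_op->aio_read || f->f_op->read_iter))
                f->f_mode |= FMODE_CAN_READ;
        if ((f->f_mode & FMODE_WRITE) &&
            (f->f_op->write || f->f_op->aio_write || f->f_op->write_iter))
                f->f_mode |= FMODE_CAN_WRITE;
}
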
@@@ -278,7 -278,7 +278,7 @@@ out
        fput(filp);
        return rc;
  }
 -EXPORT_SYMBOL(fsg_lun_open);
 +EXPORT_SYMBOL_GPL(fsg_lun_open);
  
  
  /*-------------------------------------------------------------------------*/
@@@ -295,7 -295,7 +295,7 @@@ int fsg_lun_fsync_sub(struct fsg_lun *c
                return 0;
        return vfs_fsync(filp, 1);
  }
 -EXPORT_SYMBOL(fsg_lun_fsync_sub);
 +EXPORT_SYMBOL_GPL(fsg_lun_fsync_sub);
  
  void store_cdrom_address(u8 *dest, int msf, u32 addr)
  {
                put_unaligned_be32(addr, dest);
        }
  }
 -EXPORT_SYMBOL(store_cdrom_address);
 +EXPORT_SYMBOL_GPL(store_cdrom_address);
  
  /*-------------------------------------------------------------------------*/
  
@@@ -325,13 -325,13 +325,13 @@@ ssize_t fsg_show_ro(struct fsg_lun *cur
                                  ? curlun->ro
                                  : curlun->initially_ro);
  }
 -EXPORT_SYMBOL(fsg_show_ro);
 +EXPORT_SYMBOL_GPL(fsg_show_ro);
  
  ssize_t fsg_show_nofua(struct fsg_lun *curlun, char *buf)
  {
        return sprintf(buf, "%u\n", curlun->nofua);
  }
 -EXPORT_SYMBOL(fsg_show_nofua);
 +EXPORT_SYMBOL_GPL(fsg_show_nofua);
  
  ssize_t fsg_show_file(struct fsg_lun *curlun, struct rw_semaphore *filesem,
                      char *buf)
        up_read(filesem);
        return rc;
  }
 -EXPORT_SYMBOL(fsg_show_file);
 +EXPORT_SYMBOL_GPL(fsg_show_file);
  
  ssize_t fsg_show_cdrom(struct fsg_lun *curlun, char *buf)
  {
        return sprintf(buf, "%u\n", curlun->cdrom);
  }
 -EXPORT_SYMBOL(fsg_show_cdrom);
 +EXPORT_SYMBOL_GPL(fsg_show_cdrom);
  
  ssize_t fsg_show_removable(struct fsg_lun *curlun, char *buf)
  {
        return sprintf(buf, "%u\n", curlun->removable);
  }
 -EXPORT_SYMBOL(fsg_show_removable);
 +EXPORT_SYMBOL_GPL(fsg_show_removable);
  
  /*
   * The caller must hold fsg->filesem for reading when calling this function.
@@@ -410,7 -410,7 +410,7 @@@ ssize_t fsg_store_ro(struct fsg_lun *cu
  
        return rc;
  }
 -EXPORT_SYMBOL(fsg_store_ro);
 +EXPORT_SYMBOL_GPL(fsg_store_ro);
  
  ssize_t fsg_store_nofua(struct fsg_lun *curlun, const char *buf, size_t count)
  {
  
        return count;
  }
 -EXPORT_SYMBOL(fsg_store_nofua);
 +EXPORT_SYMBOL_GPL(fsg_store_nofua);
  
  ssize_t fsg_store_file(struct fsg_lun *curlun, struct rw_semaphore *filesem,
                       const char *buf, size_t count)
        up_write(filesem);
        return (rc < 0 ? rc : count);
  }
 -EXPORT_SYMBOL(fsg_store_file);
 +EXPORT_SYMBOL_GPL(fsg_store_file);
  
  ssize_t fsg_store_cdrom(struct fsg_lun *curlun, struct rw_semaphore *filesem,
                        const char *buf, size_t count)
  
        return ret;
  }
 -EXPORT_SYMBOL(fsg_store_cdrom);
 +EXPORT_SYMBOL_GPL(fsg_store_cdrom);
  
  ssize_t fsg_store_removable(struct fsg_lun *curlun, const char *buf,
                            size_t count)
  
        return count;
  }
 -EXPORT_SYMBOL(fsg_store_removable);
 +EXPORT_SYMBOL_GPL(fsg_store_removable);
  
  MODULE_LICENSE("GPL");
diff --combined fs/9p/vfs_file.c
index 96e550760699a8895cbb58a4dd26a18ab1e050a3,b9b5f979a2ca7baafa426037229f6d208dd3710d..520c11c2dcca4c9ff31a591600ca0c8ced52481c
@@@ -352,6 -352,9 +352,6 @@@ static int v9fs_file_flock_dotl(struct 
                invalidate_mapping_pages(&inode->i_data, 0, -1);
        }
        /* Convert flock to posix lock */
 -      fl->fl_owner = (fl_owner_t)filp;
 -      fl->fl_start = 0;
 -      fl->fl_end = OFFSET_MAX;
        fl->fl_flags |= FL_POSIX;
        fl->fl_flags ^= FL_FLOCK;
  
@@@ -681,7 -684,7 +681,7 @@@ v9fs_direct_read(struct file *filp, cha
  /**
   * v9fs_cached_file_read - read from a file
   * @filp: file pointer to read
 - * @udata: user data buffer to read data into
 + * @data: user data buffer to read data into
   * @count: size of buffer
   * @offset: offset at which to read data
   *
@@@ -692,13 -695,13 +692,13 @@@ v9fs_cached_file_read(struct file *filp
  {
        if (filp->f_flags & O_DIRECT)
                return v9fs_direct_read(filp, data, count, offset);
-       return do_sync_read(filp, data, count, offset);
+       return new_sync_read(filp, data, count, offset);
  }
  
  /**
   * v9fs_mmap_file_read - read from a file
   * @filp: file pointer to read
 - * @udata: user data buffer to read data into
 + * @data: user data buffer to read data into
   * @count: size of buffer
   * @offset: offset at which to read data
   *
@@@ -760,7 -763,7 +760,7 @@@ err_out
  
  buff_write:
        mutex_unlock(&inode->i_mutex);
-       return do_sync_write(filp, data, count, offsetp);
+       return new_sync_write(filp, data, count, offsetp);
  }
  
  /**
@@@ -778,7 -781,7 +778,7 @@@ v9fs_cached_file_write(struct file *fil
  
        if (filp->f_flags & O_DIRECT)
                return v9fs_direct_write(filp, data, count, offset);
-       return do_sync_write(filp, data, count, offset);
+       return new_sync_write(filp, data, count, offset);
  }
  
  
@@@ -847,8 -850,8 +847,8 @@@ const struct file_operations v9fs_cache
        .llseek = generic_file_llseek,
        .read = v9fs_cached_file_read,
        .write = v9fs_cached_file_write,
-       .aio_read = generic_file_aio_read,
-       .aio_write = generic_file_aio_write,
+       .read_iter = generic_file_read_iter,
+       .write_iter = generic_file_write_iter,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
        .lock = v9fs_file_lock,
@@@ -860,8 -863,8 +860,8 @@@ const struct file_operations v9fs_cache
        .llseek = generic_file_llseek,
        .read = v9fs_cached_file_read,
        .write = v9fs_cached_file_write,
-       .aio_read = generic_file_aio_read,
-       .aio_write = generic_file_aio_write,
+       .read_iter = generic_file_read_iter,
+       .write_iter = generic_file_write_iter,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
        .lock = v9fs_file_lock_dotl,
diff --combined fs/affs/file.c
index 0270303388ee669515c8829f7370da24db6d16e2,9df23175e28b910e5cec924dd87249956d4051a7..a7fe57d2cd9a0aa6a59df2cd90778127ebc2bbc3
@@@ -27,10 -27,10 +27,10 @@@ static int affs_file_release(struct ino
  
  const struct file_operations affs_file_operations = {
        .llseek         = generic_file_llseek,
-       .read           = do_sync_read,
-       .aio_read       = generic_file_aio_read,
-       .write          = do_sync_write,
-       .aio_write      = generic_file_aio_write,
+       .read           = new_sync_read,
+       .read_iter      = generic_file_read_iter,
+       .write          = new_sync_write,
+       .write_iter     = generic_file_write_iter,
        .mmap           = generic_file_mmap,
        .open           = affs_file_open,
        .release        = affs_file_release,
@@@ -45,7 -45,7 +45,7 @@@ const struct inode_operations affs_file
  static int
  affs_file_open(struct inode *inode, struct file *filp)
  {
 -      pr_debug("AFFS: open(%lu,%d)\n",
 +      pr_debug("open(%lu,%d)\n",
                 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
        atomic_inc(&AFFS_I(inode)->i_opencnt);
        return 0;
@@@ -54,7 -54,7 +54,7 @@@
  static int
  affs_file_release(struct inode *inode, struct file *filp)
  {
 -      pr_debug("AFFS: release(%lu, %d)\n",
 +      pr_debug("release(%lu, %d)\n",
                 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
  
        if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
@@@ -324,8 -324,7 +324,8 @@@ affs_get_block(struct inode *inode, sec
        struct buffer_head      *ext_bh;
        u32                      ext;
  
 -      pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block);
 +      pr_debug("%s(%u, %lu)\n",
 +               __func__, (u32)inode->i_ino, (unsigned long)block);
  
        BUG_ON(block > (sector_t)0x7fffffffUL);
  
@@@ -499,36 -498,34 +499,36 @@@ affs_getemptyblk_ino(struct inode *inod
  }
  
  static int
 -affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to)
 +affs_do_readpage_ofs(struct page *page, unsigned to)
  {
        struct inode *inode = page->mapping->host;
        struct super_block *sb = inode->i_sb;
        struct buffer_head *bh;
        char *data;
 +      unsigned pos = 0;
        u32 bidx, boff, bsize;
        u32 tmp;
  
 -      pr_debug("AFFS: read_page(%u, %ld, %d, %d)\n", (u32)inode->i_ino, page->index, from, to);
 -      BUG_ON(from > to || to > PAGE_CACHE_SIZE);
 +      pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino,
 +               page->index, to);
 +      BUG_ON(to > PAGE_CACHE_SIZE);
        kmap(page);
        data = page_address(page);
        bsize = AFFS_SB(sb)->s_data_blksize;
 -      tmp = (page->index << PAGE_CACHE_SHIFT) + from;
 +      tmp = page->index << PAGE_CACHE_SHIFT;
        bidx = tmp / bsize;
        boff = tmp % bsize;
  
 -      while (from < to) {
 +      while (pos < to) {
                bh = affs_bread_ino(inode, bidx, 0);
                if (IS_ERR(bh))
                        return PTR_ERR(bh);
 -              tmp = min(bsize - boff, to - from);
 -              BUG_ON(from + tmp > to || tmp > bsize);
 -              memcpy(data + from, AFFS_DATA(bh) + boff, tmp);
 +              tmp = min(bsize - boff, to - pos);
 +              BUG_ON(pos + tmp > to || tmp > bsize);
 +              memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
                affs_brelse(bh);
                bidx++;
 -              from += tmp;
 +              pos += tmp;
                boff = 0;
        }
        flush_dcache_page(page);
@@@ -545,7 -542,7 +545,7 @@@ affs_extent_file_ofs(struct inode *inod
        u32 size, bsize;
        u32 tmp;
  
 -      pr_debug("AFFS: extent_file(%u, %d)\n", (u32)inode->i_ino, newsize);
 +      pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize);
        bsize = AFFS_SB(sb)->s_data_blksize;
        bh = NULL;
        size = AFFS_I(inode)->mmu_private;
@@@ -611,14 -608,14 +611,14 @@@ affs_readpage_ofs(struct file *file, st
        u32 to;
        int err;
  
 -      pr_debug("AFFS: read_page(%u, %ld)\n", (u32)inode->i_ino, page->index);
 +      pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index);
        to = PAGE_CACHE_SIZE;
        if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
                to = inode->i_size & ~PAGE_CACHE_MASK;
                memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to);
        }
  
 -      err = affs_do_readpage_ofs(file, page, 0, to);
 +      err = affs_do_readpage_ofs(page, to);
        if (!err)
                SetPageUptodate(page);
        unlock_page(page);
@@@ -634,8 -631,7 +634,8 @@@ static int affs_write_begin_ofs(struct 
        pgoff_t index;
        int err = 0;
  
 -      pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len);
 +      pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino,
 +               (unsigned long long)pos, (unsigned long long)pos + len);
        if (pos > AFFS_I(inode)->mmu_private) {
                /* XXX: this probably leaves a too-big i_size in case of
                 * failure. Should really be updating i_size at write_end time
                return 0;
  
        /* XXX: inefficient but safe in the face of short writes */
 -      err = affs_do_readpage_ofs(file, page, 0, PAGE_CACHE_SIZE);
 +      err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE);
        if (err) {
                unlock_page(page);
                page_cache_release(page);
@@@ -684,9 -680,7 +684,9 @@@ static int affs_write_end_ofs(struct fi
         * due to write_begin.
         */
  
 -      pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len);
 +      pr_debug("%s(%u, %llu, %llu)\n",
 +               __func__, (u32)inode->i_ino, (unsigned long long)pos,
 +               (unsigned long long)pos + len);
        bsize = AFFS_SB(sb)->s_data_blksize;
        data = page_address(page);
  
@@@ -808,7 -802,7 +808,7 @@@ affs_free_prealloc(struct inode *inode
  {
        struct super_block *sb = inode->i_sb;
  
 -      pr_debug("AFFS: free_prealloc(ino=%lu)\n", inode->i_ino);
 +      pr_debug("free_prealloc(ino=%lu)\n", inode->i_ino);
  
        while (AFFS_I(inode)->i_pa_cnt) {
                AFFS_I(inode)->i_pa_cnt--;
@@@ -828,7 -822,7 +828,7 @@@ affs_truncate(struct inode *inode
        struct buffer_head *ext_bh;
        int i;
  
 -      pr_debug("AFFS: truncate(inode=%d, oldsize=%u, newsize=%u)\n",
 +      pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n",
                 (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size);
  
        last_blk = 0;
diff --combined fs/block_dev.c
index 83fba15cc394071a53b57245b95942c18177d9c7,e68e150b1b163c15da172cfa60ed832d14841495..6d7274619bf916c2dcf0d7744ba8d888d948d711
@@@ -165,14 -165,15 +165,15 @@@ blkdev_get_block(struct inode *inode, s
  }
  
  static ssize_t
- blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-                       loff_t offset, unsigned long nr_segs)
+ blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+                       loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
  
-       return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
-                                   nr_segs, blkdev_get_block, NULL, NULL, 0);
+       return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
+                                   offset, blkdev_get_block,
+                                   NULL, NULL, 0);
  }
  
  int __sync_blockdev(struct block_device *bdev, int wait)
@@@ -363,69 -364,6 +364,69 @@@ int blkdev_fsync(struct file *filp, lof
  }
  EXPORT_SYMBOL(blkdev_fsync);
  
 +/**
 + * bdev_read_page() - Start reading a page from a block device
 + * @bdev: The device to read the page from
 + * @sector: The offset on the device to read the page from (need not be aligned)
 + * @page: The page to read
 + *
 + * On entry, the page should be locked.  It will be unlocked when the page
 + * has been read.  If the block driver implements rw_page synchronously,
 + * that will be true on exit from this function, but it need not be.
 + *
 + * Errors returned by this function are usually "soft", eg out of memory, or
 + * queue full; callers should try a different route to read this page rather
 + * than propagate an error back up the stack.
 + *
 + * Return: negative errno if an error occurs, 0 if submission was successful.
 + */
 +int bdev_read_page(struct block_device *bdev, sector_t sector,
 +                      struct page *page)
 +{
 +      const struct block_device_operations *ops = bdev->bd_disk->fops;
 +      if (!ops->rw_page)
 +              return -EOPNOTSUPP;
 +      return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
 +}
 +EXPORT_SYMBOL_GPL(bdev_read_page);
 +
 +/**
 + * bdev_write_page() - Start writing a page to a block device
 + * @bdev: The device to write the page to
 + * @sector: The offset on the device to write the page to (need not be aligned)
 + * @page: The page to write
 + * @wbc: The writeback_control for the write
 + *
 + * On entry, the page should be locked and not currently under writeback.
 + * On exit, if the write started successfully, the page will be unlocked and
 + * under writeback.  If the write failed already (eg the driver failed to
 + * queue the page to the device), the page will still be locked.  If the
 + * caller is a ->writepage implementation, it will need to unlock the page.
 + *
 + * Errors returned by this function are usually "soft", eg out of memory, or
 + * queue full; callers should try a different route to write this page rather
 + * than propagate an error back up the stack.
 + *
 + * Return: negative errno if an error occurs, 0 if submission was successful.
 + */
 +int bdev_write_page(struct block_device *bdev, sector_t sector,
 +                      struct page *page, struct writeback_control *wbc)
 +{
 +      int result;
 +      int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
 +      const struct block_device_operations *ops = bdev->bd_disk->fops;
 +      if (!ops->rw_page)
 +              return -EOPNOTSUPP;
 +      set_page_writeback(page);
 +      result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
 +      if (result)
 +              end_page_writeback(page);
 +      else
 +              unlock_page(page);
 +      return result;
 +}
 +EXPORT_SYMBOL_GPL(bdev_write_page);
 +
  /*
   * pseudo-fs
   */
@@@ -1571,43 -1509,38 +1572,38 @@@ static long block_ioctl(struct file *fi
   * Does not take i_mutex for the write and thus is not for general purpose
   * use.
   */
- ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                        unsigned long nr_segs, loff_t pos)
+ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct blk_plug plug;
        ssize_t ret;
  
-       BUG_ON(iocb->ki_pos != pos);
        blk_start_plug(&plug);
-       ret = __generic_file_aio_write(iocb, iov, nr_segs);
+       ret = __generic_file_write_iter(iocb, from);
        if (ret > 0) {
                ssize_t err;
-               err = generic_write_sync(file, pos, ret);
+               err = generic_write_sync(file, iocb->ki_pos - ret, ret);
                if (err < 0)
                        ret = err;
        }
        blk_finish_plug(&plug);
        return ret;
  }
- EXPORT_SYMBOL_GPL(blkdev_aio_write);
+ EXPORT_SYMBOL_GPL(blkdev_write_iter);
  
- static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
-                        unsigned long nr_segs, loff_t pos)
+ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
        struct file *file = iocb->ki_filp;
        struct inode *bd_inode = file->f_mapping->host;
        loff_t size = i_size_read(bd_inode);
+       loff_t pos = iocb->ki_pos;
  
        if (pos >= size)
                return 0;
  
        size -= pos;
-       if (size < iocb->ki_nbytes)
-               nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
-       return generic_file_aio_read(iocb, iov, nr_segs, pos);
+       iov_iter_truncate(to, size);
+       return generic_file_read_iter(iocb, to);
  }
  
  /*
@@@ -1639,10 -1572,10 +1635,10 @@@ const struct file_operations def_blk_fo
        .open           = blkdev_open,
        .release        = blkdev_close,
        .llseek         = block_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = blkdev_aio_read,
-       .aio_write      = blkdev_aio_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = blkdev_read_iter,
+       .write_iter     = blkdev_write_iter,
        .mmap           = generic_file_mmap,
        .fsync          = blkdev_fsync,
        .unlocked_ioctl = block_ioctl,
        .compat_ioctl   = compat_blkdev_ioctl,
  #endif
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
  };
  
  int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --combined fs/btrfs/file.c
index e472441feb5de80ce68c85e6f892531893706a13,17e7393c50f0a97484ec9ce7898c9b6cf7aacdfa..1f2b99cb55eaef682c51ae3c8b227e88aafbf7e7
@@@ -40,7 -40,6 +40,7 @@@
  #include "tree-log.h"
  #include "locking.h"
  #include "volumes.h"
 +#include "qgroup.h"
  
  static struct kmem_cache *btrfs_inode_defrag_cachep;
  /*
@@@ -448,7 -447,7 +448,7 @@@ static noinline int btrfs_copy_from_use
                write_bytes -= copied;
                total_copied += copied;
  
-               /* Return to btrfs_file_aio_write to fault page */
+               /* Return to btrfs_file_write_iter to fault page */
                if (unlikely(copied == 0))
                        break;
  
@@@ -471,12 -470,11 +471,12 @@@ static void btrfs_drop_pages(struct pag
        for (i = 0; i < num_pages; i++) {
                /* page checked is some magic around finding pages that
                 * have been modified without going through btrfs_set_page_dirty
 -               * clear it here
 +               * clear it here. There should be no need to mark the pages
 +               * clear it here. There should be no need to mark the pages
 +               * accessed, as prepare_pages already marked them accessed
 +               * via find_or_create_page()
                ClearPageChecked(pages[i]);
                unlock_page(pages[i]);
 -              mark_page_accessed(pages[i]);
                page_cache_release(pages[i]);
        }
  }
@@@ -716,7 -714,7 +716,7 @@@ int __btrfs_drop_extents(struct btrfs_t
        int recow;
        int ret;
        int modify_tree = -1;
 -      int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
 +      int update_refs;
        int found = 0;
        int leafs_visited = 0;
  
        if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
                modify_tree = 0;
  
 +      update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 +                     root == root->fs_info->tree_root);
        while (1) {
                recow = 0;
                ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@@ -784,18 -780,6 +784,18 @@@ next_slot
                        extent_end = search_start;
                }
  
 +              /*
 +               * Don't skip extent items representing 0 byte lengths. They
 +               * used to be created (a bug) when we hit an -ENOSPC condition
 +               * while punching holes. So if we find one here, just ensure
 +               * we delete it; otherwise we would insert a new file extent
 +               * item with the same key (offset) as that 0-byte-length file
 +               * extent item in the call to setup_items_for_insert() later
 +               * in this function.
 +               */
 +              if (extent_end == key.offset && extent_end >= search_start)
 +                      goto delete_extent_item;
 +
                if (extent_end <= search_start) {
                        path->slots[0]++;
                        goto next_slot;
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
 -                                              start - extent_offset, 0);
 +                                              start - extent_offset, 1);
                                BUG_ON(ret); /* -ENOMEM */
                        }
                        key.offset = start;
                 *    | ------ extent ------ |
                 */
                if (start <= key.offset && end >= extent_end) {
 +delete_extent_item:
                        if (del_nr == 0) {
                                del_slot = path->slots[0];
                                del_nr = 1;
@@@ -1208,7 -1191,7 +1208,7 @@@ again
  
                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                                           root->root_key.objectid,
 -                                         ino, orig_offset, 0);
 +                                         ino, orig_offset, 1);
                BUG_ON(ret); /* -ENOMEM */
  
                if (split == start) {
@@@ -1675,27 -1658,22 +1675,22 @@@ again
  }
  
  static ssize_t __btrfs_direct_write(struct kiocb *iocb,
-                                   const struct iovec *iov,
-                                   unsigned long nr_segs, loff_t pos,
-                                   size_t count, size_t ocount)
+                                   struct iov_iter *from,
+                                   loff_t pos)
  {
        struct file *file = iocb->ki_filp;
-       struct iov_iter i;
        ssize_t written;
        ssize_t written_buffered;
        loff_t endbyte;
        int err;
  
-       written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
-                                           count, ocount);
+       written = generic_file_direct_write(iocb, from, pos);
  
-       if (written < 0 || written == count)
+       if (written < 0 || !iov_iter_count(from))
                return written;
  
        pos += written;
-       count -= written;
-       iov_iter_init(&i, iov, nr_segs, count, written);
-       written_buffered = __btrfs_buffered_write(file, &i, pos);
+       written_buffered = __btrfs_buffered_write(file, from, pos);
        if (written_buffered < 0) {
                err = written_buffered;
                goto out;
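
The conversion above keeps btrfs's direct-then-buffered fallback, but the residue tracking now lives in the iterator: generic_file_direct_write() advances 'from' past whatever went direct. A simplified sketch (buffered_tail() is hypothetical; the real code also syncs and invalidates the buffered range):

/* hypothetical helper that writes the remainder through the page cache */
static ssize_t buffered_tail(struct kiocb *iocb, struct iov_iter *from,
                             loff_t pos);

static ssize_t direct_with_fallback(struct kiocb *iocb, struct iov_iter *from,
                                    loff_t pos)
{
        ssize_t written = generic_file_direct_write(iocb, from, pos);
        ssize_t buffered;

        if (written < 0 || !iov_iter_count(from))
                return written; /* error, or everything went direct */

        /* 'from' now describes exactly the unwritten tail */
        buffered = buffered_tail(iocb, from, pos + written);
        return buffered < 0 ? buffered : written + buffered;
}
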
@@@ -1730,9 -1708,8 +1725,8 @@@ static void update_time_for_write(struc
                inode_inc_iversion(inode);
  }
  
- static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
-                                   const struct iovec *iov,
-                                   unsigned long nr_segs, loff_t pos)
+ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+                                   struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        u64 end_pos;
        ssize_t num_written = 0;
        ssize_t err = 0;
-       size_t count, ocount;
+       size_t count = iov_iter_count(from);
        bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+       loff_t pos = iocb->ki_pos;
  
        mutex_lock(&inode->i_mutex);
  
-       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-       if (err) {
-               mutex_unlock(&inode->i_mutex);
-               goto out;
-       }
-       count = ocount;
        current->backing_dev_info = inode->i_mapping->backing_dev_info;
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err) {
                goto out;
        }
  
+       iov_iter_truncate(from, count);
        err = file_remove_suid(file);
        if (err) {
                mutex_unlock(&inode->i_mutex);
                atomic_inc(&BTRFS_I(inode)->sync_writers);
  
        if (unlikely(file->f_flags & O_DIRECT)) {
-               num_written = __btrfs_direct_write(iocb, iov, nr_segs,
-                                                  pos, count, ocount);
+               num_written = __btrfs_direct_write(iocb, from, pos);
        } else {
-               struct iov_iter i;
-               iov_iter_init(&i, iov, nr_segs, count, num_written);
-               num_written = __btrfs_buffered_write(file, &i, pos);
+               num_written = __btrfs_buffered_write(file, from, pos);
                if (num_written > 0)
                        iocb->ki_pos = pos + num_written;
        }
@@@ -2026,10 -1994,8 +2011,10 @@@ int btrfs_sync_file(struct file *file, 
                if (!full_sync) {
                        ret = btrfs_wait_ordered_range(inode, start,
                                                       end - start + 1);
 -                      if (ret)
 +                      if (ret) {
 +                              btrfs_end_transaction(trans, root);
                                goto out;
 +                      }
                }
                ret = btrfs_commit_transaction(trans, root);
        } else {
        return 0;
  }
  
 +/*
 + * Find a hole extent on the given inode and change start/len to the end of
 + * the hole extent (a hole/vacuum extent whose em->start <= start &&
 + * em->start + em->len > start).
 + * When a hole extent is found, return 1 and modify start/len.
 + */
 +static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
 +{
 +      struct extent_map *em;
 +      int ret = 0;
 +
 +      em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
 +      if (IS_ERR_OR_NULL(em)) {
 +              if (!em)
 +                      ret = -ENOMEM;
 +              else
 +                      ret = PTR_ERR(em);
 +              return ret;
 +      }
 +
 +      /* Hole or vacuum extent (only exists in no-hole mode) */
 +      if (em->block_start == EXTENT_MAP_HOLE) {
 +              ret = 1;
 +              *len = em->start + em->len > *start + *len ?
 +                     0 : *start + *len - em->start - em->len;
 +              *start = em->start + em->len;
 +      }
 +      free_extent_map(em);
 +      return ret;
 +}
 +
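
A hedged usage sketch of the helper just added, mirroring how btrfs_punch_hole() calls it below (skip_known_holes() is a hypothetical wrapper):

static int skip_known_holes(struct inode *inode, u64 *offset, u64 *len)
{
        int ret = find_first_non_hole(inode, offset, len);

        if (ret < 0)
                return ret;             /* extent lookup failed */
        if (ret && *len == 0)
                return 1;               /* range is already a hole */
        return 0;       /* [*offset, *offset + *len) still needs punching */
}
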
  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_path *path;
        struct btrfs_block_rsv *rsv;
        struct btrfs_trans_handle *trans;
 -      u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
 -      u64 lockend = round_down(offset + len,
 -                               BTRFS_I(inode)->root->sectorsize) - 1;
 -      u64 cur_offset = lockstart;
 +      u64 lockstart;
 +      u64 lockend;
 +      u64 tail_start;
 +      u64 tail_len;
 +      u64 orig_start = offset;
 +      u64 cur_offset;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
        u64 drop_end;
        int ret = 0;
        int err = 0;
        int rsv_count;
 -      bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
 -                        ((offset + len - 1) >> PAGE_CACHE_SHIFT));
 +      bool same_page;
        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 -      u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
 +      u64 ino_size;
  
        ret = btrfs_wait_ordered_range(inode, offset, len);
        if (ret)
                return ret;
  
        mutex_lock(&inode->i_mutex);
 +      ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
 +      ret = find_first_non_hole(inode, &offset, &len);
 +      if (ret < 0)
 +              goto out_only_mutex;
 +      if (ret && !len) {
 +              /* Already in a large hole */
 +              ret = 0;
 +              goto out_only_mutex;
 +      }
 +
 +      lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
 +      lockend = round_down(offset + len,
 +                           BTRFS_I(inode)->root->sectorsize) - 1;
 +      same_page = ((offset >> PAGE_CACHE_SHIFT) ==
 +                  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
 +
        /*
         * We needn't truncate any page which is beyond the end of the file
         * because we are sure there is no data there.
        if (same_page && len < PAGE_CACHE_SIZE) {
                if (offset < ino_size)
                        ret = btrfs_truncate_page(inode, offset, len, 0);
 -              mutex_unlock(&inode->i_mutex);
 -              return ret;
 +              goto out_only_mutex;
        }
  
        /* zero back part of the first page */
                }
        }
  
 -      /* zero the front end of the last page */
 -      if (offset + len < ino_size) {
 -              ret = btrfs_truncate_page(inode, offset + len, 0, 1);
 -              if (ret) {
 -                      mutex_unlock(&inode->i_mutex);
 -                      return ret;
 +      /* Check the aligned pages after the first unaligned page.
 +       * If offset != orig_start, the first unaligned page and several
 +       * following pages are already in holes, so the extra check can
 +       * be skipped */
 +      if (offset == orig_start) {
 +              /* after truncate page, check hole again */
 +              len = offset + len - lockstart;
 +              offset = lockstart;
 +              ret = find_first_non_hole(inode, &offset, &len);
 +              if (ret < 0)
 +                      goto out_only_mutex;
 +              if (ret && !len) {
 +                      ret = 0;
 +                      goto out_only_mutex;
 +              }
 +              lockstart = offset;
 +      }
 +
 +      /* Check whether the unaligned tail part is in a hole */
 +      tail_start = lockend + 1;
 +      tail_len = offset + len - tail_start;
 +      if (tail_len) {
 +              ret = find_first_non_hole(inode, &tail_start, &tail_len);
 +              if (unlikely(ret < 0))
 +                      goto out_only_mutex;
 +              if (!ret) {
 +                      /* zero the front end of the last page */
 +                      if (tail_start + tail_len < ino_size) {
 +                              ret = btrfs_truncate_page(inode,
 +                                              tail_start + tail_len, 0, 1);
 +                              if (ret)
 +                                      goto out_only_mutex;
 +                              }
                }
        }
  
                if ((!ordered ||
                    (ordered->file_offset + ordered->len <= lockstart ||
                     ordered->file_offset > lockend)) &&
 -                   !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
 -                                   lockend, EXTENT_UPTODATE, 0,
 -                                   cached_state)) {
 +                   !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
                        if (ordered)
                                btrfs_put_ordered_extent(ordered);
                        break;
        BUG_ON(ret);
        trans->block_rsv = rsv;
  
 +      cur_offset = lockstart;
 +      len = lockend - cur_offset;
        while (cur_offset < lockend) {
                ret = __btrfs_drop_extents(trans, root, inode, path,
                                           cur_offset, lockend + 1,
                                              rsv, min_size);
                BUG_ON(ret);    /* shouldn't happen */
                trans->block_rsv = rsv;
 +
 +              ret = find_first_non_hole(inode, &cur_offset, &len);
 +              if (unlikely(ret < 0))
 +                      break;
 +              if (ret && !len) {
 +                      ret = 0;
 +                      break;
 +              }
        }
  
        if (ret) {
        }
  
        trans->block_rsv = &root->fs_info->trans_block_rsv;
 -      if (cur_offset < ino_size) {
 +      /*
 +       * Don't insert file hole extent item if it's for a range beyond eof
 +       * (because it's useless) or if it represents a 0 bytes range (when
 +       * cur_offset == drop_end).
 +       */
 +      if (cur_offset < ino_size && cur_offset < drop_end) {
                ret = fill_holes(trans, inode, path, cur_offset, drop_end);
                if (ret) {
                        err = ret;
@@@ -2478,7 -2357,6 +2463,7 @@@ out_free
  out:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             &cached_state, GFP_NOFS);
 +out_only_mutex:
        mutex_unlock(&inode->i_mutex);
        if (ret && !err)
                err = ret;
  
  const struct file_operations btrfs_file_operations = {
        .llseek         = btrfs_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = generic_file_aio_read,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = generic_file_read_iter,
        .splice_read    = generic_file_splice_read,
-       .aio_write      = btrfs_file_aio_write,
+       .write_iter     = btrfs_file_write_iter,
        .mmap           = btrfs_file_mmap,
        .open           = generic_file_open,
        .release        = btrfs_release_file,
diff --combined fs/btrfs/inode.c
index 7fa5f7fd7bc79259ed5a5e51131cbf6c8d07d919,c8386f1961f001586f5f1824e7e02d1b63bf4470..8925f66a14115c9d733182f2ec4d113be5be5edd
@@@ -125,7 -125,7 +125,7 @@@ static int btrfs_init_inode_security(st
   * the btree.  The caller should have done a btrfs_drop_extents so that
   * no overlapping inline items exist in the btree
   */
 -static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 +static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
@@@ -2678,7 -2678,6 +2678,7 @@@ static int btrfs_finish_ordered_io(stru
                trans = NULL;
                goto out_unlock;
        }
 +
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@@ -2948,15 -2947,14 +2948,15 @@@ void btrfs_orphan_commit_root(struct bt
        root->orphan_block_rsv = NULL;
        spin_unlock(&root->orphan_lock);
  
 -      if (root->orphan_item_inserted &&
 +      if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
            btrfs_root_refs(&root->root_item) > 0) {
                ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
                                            root->root_key.objectid);
                if (ret)
                        btrfs_abort_transaction(trans, root, ret);
                else
 -                      root->orphan_item_inserted = 0;
 +                      clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
 +                                &root->state);
        }
  
        if (block_rsv) {
@@@ -3273,8 -3271,7 +3273,8 @@@ int btrfs_orphan_cleanup(struct btrfs_r
                btrfs_block_rsv_release(root, root->orphan_block_rsv,
                                        (u64)-1);
  
 -      if (root->orphan_block_rsv || root->orphan_item_inserted) {
 +      if (root->orphan_block_rsv ||
 +          test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
                trans = btrfs_join_transaction(root);
                if (!IS_ERR(trans))
                        btrfs_end_transaction(trans, root);
@@@ -3476,7 -3473,7 +3476,7 @@@ cache_acl
                ret = btrfs_load_inode_props(inode, path);
                if (ret)
                        btrfs_err(root->fs_info,
 -                                "error loading props for ino %llu (root %llu): %d\n",
 +                                "error loading props for ino %llu (root %llu): %d",
                                  btrfs_ino(inode),
                                  root->root_key.objectid, ret);
        }
@@@ -4001,8 -3998,7 +4001,8 @@@ int btrfs_truncate_inode_items(struct b
         * not block aligned since we will be keeping the last block of the
         * extent just the way it is.
         */
 -      if (root->ref_cows || root == root->fs_info->tree_root)
 +      if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 +          root == root->fs_info->tree_root)
                btrfs_drop_extent_cache(inode, ALIGN(new_size,
                                        root->sectorsize), (u64)-1, 0);
  
@@@ -4095,9 -4091,7 +4095,9 @@@ search_again
                                                         extent_num_bytes);
                                num_dec = (orig_num_bytes -
                                           extent_num_bytes);
 -                              if (root->ref_cows && extent_start != 0)
 +                              if (test_bit(BTRFS_ROOT_REF_COWS,
 +                                           &root->state) &&
 +                                  extent_start != 0)
                                        inode_sub_bytes(inode, num_dec);
                                btrfs_mark_buffer_dirty(leaf);
                        } else {
                                num_dec = btrfs_file_extent_num_bytes(leaf, fi);
                                if (extent_start != 0) {
                                        found_extent = 1;
 -                                      if (root->ref_cows)
 +                                      if (test_bit(BTRFS_ROOT_REF_COWS,
 +                                                   &root->state))
                                                inode_sub_bytes(inode, num_dec);
                                }
                        }
                            btrfs_file_extent_other_encoding(leaf, fi) == 0) {
                                u32 size = new_size - found_key.offset;
  
 -                              if (root->ref_cows) {
 +                              if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
                                        inode_sub_bytes(inode, item_end + 1 -
                                                        new_size);
 -                              }
  
                                /*
                                 * update the ram bytes to properly reflect
                                size =
                                    btrfs_file_extent_calc_inline_size(size);
                                btrfs_truncate_item(root, path, size, 1);
 -                      } else if (root->ref_cows) {
 +                      } else if (test_bit(BTRFS_ROOT_REF_COWS,
 +                                          &root->state)) {
                                inode_sub_bytes(inode, item_end + 1 -
                                                found_key.offset);
                        }
@@@ -4162,9 -4155,8 +4162,9 @@@ delete
                } else {
                        break;
                }
 -              if (found_extent && (root->ref_cows ||
 -                                   root == root->fs_info->tree_root)) {
 +              if (found_extent &&
 +                  (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 +                   root == root->fs_info->tree_root)) {
                        btrfs_set_path_blocking(path);
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
@@@ -5176,7 -5168,8 +5176,7 @@@ static int btrfs_dentry_delete(const st
  
  static void btrfs_dentry_release(struct dentry *dentry)
  {
 -      if (dentry->d_fsdata)
 -              kfree(dentry->d_fsdata);
 +      kfree(dentry->d_fsdata);
  }
  
  static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
@@@ -5560,7 -5553,6 +5560,7 @@@ static struct inode *btrfs_new_inode(st
        struct btrfs_inode_ref *ref;
        struct btrfs_key key[2];
        u32 sizes[2];
 +      int nitems = name ? 2 : 1;
        unsigned long ptr;
        int ret;
  
         */
        inode->i_ino = objectid;
  
 -      if (dir) {
 +      if (dir && name) {
                trace_btrfs_inode_request(dir);
  
                ret = btrfs_set_inode_index(dir, index);
                        iput(inode);
                        return ERR_PTR(ret);
                }
 +      } else if (dir) {
 +              *index = 0;
        }
        /*
         * index_cnt is ignored for everything but a dir,
        btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
        key[0].offset = 0;
  
 -      /*
 -       * Start new inodes with an inode_ref. This is slightly more
 -       * efficient for small numbers of hard links since they will
 -       * be packed into one item. Extended refs will kick in if we
 -       * add more hard links than can fit in the ref item.
 -       */
 -      key[1].objectid = objectid;
 -      btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
 -      key[1].offset = ref_objectid;
 -
        sizes[0] = sizeof(struct btrfs_inode_item);
 -      sizes[1] = name_len + sizeof(*ref);
 +
 +      if (name) {
 +              /*
 +               * Start new inodes with an inode_ref. This is slightly more
 +               * efficient for small numbers of hard links since they will
 +               * be packed into one item. Extended refs will kick in if we
 +               * add more hard links than can fit in the ref item.
 +               */
 +              key[1].objectid = objectid;
 +              btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
 +              key[1].offset = ref_objectid;
 +
 +              sizes[1] = name_len + sizeof(*ref);
 +      }
  
        path->leave_spinning = 1;
 -      ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
 +      ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
        if (ret != 0)
                goto fail;
  
                             sizeof(*inode_item));
        fill_inode_item(trans, path->nodes[0], inode_item, inode);
  
 -      ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 -                           struct btrfs_inode_ref);
 -      btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
 -      btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
 -      ptr = (unsigned long)(ref + 1);
 -      write_extent_buffer(path->nodes[0], name, ptr, name_len);
 +      if (name) {
 +              ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 +                                   struct btrfs_inode_ref);
 +              btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
 +              btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
 +              ptr = (unsigned long)(ref + 1);
 +              write_extent_buffer(path->nodes[0], name, ptr, name_len);
 +      }
  
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
  
        return inode;
  fail:
 -      if (dir)
 +      if (dir && name)
                BTRFS_I(dir)->index_cnt--;
        btrfs_free_path(path);
        iput(inode);
@@@ -5973,15 -5958,6 +5973,15 @@@ static int btrfs_link(struct dentry *ol
                err = btrfs_update_inode(trans, root, inode);
                if (err)
                        goto fail;
 +              if (inode->i_nlink == 1) {
 +                      /*
 +                       * If the new hard link count is 1, the file was
 +                       * created with the open(2) O_TMPFILE flag.
 +                       */
 +                      err = btrfs_orphan_del(trans, inode);
 +                      if (err)
 +                              goto fail;
 +              }
                d_instantiate(dentry, inode);
                btrfs_log_new_name(trans, inode, NULL, parent);
        }
@@@ -6110,8 -6086,16 +6110,8 @@@ static noinline int uncompress_inline(s
        max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
        ret = btrfs_decompress(compress_type, tmp, page,
                               extent_offset, inline_size, max_size);
 -      if (ret) {
 -              char *kaddr = kmap_atomic(page);
 -              unsigned long copy_size = min_t(u64,
 -                                PAGE_CACHE_SIZE - pg_offset,
 -                                max_size - extent_offset);
 -              memset(kaddr + pg_offset, 0, copy_size);
 -              kunmap_atomic(kaddr);
 -      }
        kfree(tmp);
 -      return 0;
 +      return ret;
  }
  
  /*
@@@ -6129,6 -6113,7 +6129,6 @@@ struct extent_map *btrfs_get_extent(str
  {
        int ret;
        int err = 0;
 -      u64 bytenr;
        u64 extent_start = 0;
        u64 extent_end = 0;
        u64 objectid = btrfs_ino(inode);
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_trans_handle *trans = NULL;
 -      int compress_type;
 +      const bool new_inline = !page || create;
  
  again:
        read_lock(&em_tree->lock);
  
        found_type = btrfs_file_extent_type(leaf, item);
        extent_start = found_key.offset;
 -      compress_type = btrfs_file_extent_compression(leaf, item);
        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                extent_end = extent_start +
@@@ -6250,10 -6236,32 +6250,10 @@@ next
                goto not_found_em;
        }
  
 -      em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
 +      btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
 +
        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 -              em->start = extent_start;
 -              em->len = extent_end - extent_start;
 -              em->orig_start = extent_start -
 -                               btrfs_file_extent_offset(leaf, item);
 -              em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
 -                                                                    item);
 -              bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 -              if (bytenr == 0) {
 -                      em->block_start = EXTENT_MAP_HOLE;
 -                      goto insert;
 -              }
 -              if (compress_type != BTRFS_COMPRESS_NONE) {
 -                      set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 -                      em->compress_type = compress_type;
 -                      em->block_start = bytenr;
 -                      em->block_len = em->orig_block_len;
 -              } else {
 -                      bytenr += btrfs_file_extent_offset(leaf, item);
 -                      em->block_start = bytenr;
 -                      em->block_len = em->len;
 -                      if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
 -                              set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 -              }
                goto insert;
        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                unsigned long ptr;
                size_t extent_offset;
                size_t copy_size;
  
 -              em->block_start = EXTENT_MAP_INLINE;
 -              if (!page || create) {
 -                      em->start = extent_start;
 -                      em->len = extent_end - extent_start;
 +              if (new_inline)
                        goto out;
 -              }
  
                size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
                extent_offset = page_offset(page) + pg_offset - extent_start;
                em->len = ALIGN(copy_size, root->sectorsize);
                em->orig_block_len = em->len;
                em->orig_start = em->start;
 -              if (compress_type) {
 -                      set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 -                      em->compress_type = compress_type;
 -              }
                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
                if (create == 0 && !PageUptodate(page)) {
                        if (btrfs_file_extent_compression(leaf, item) !=
                                ret = uncompress_inline(path, inode, page,
                                                        pg_offset,
                                                        extent_offset, item);
 -                              BUG_ON(ret); /* -ENOMEM */
 +                              if (ret) {
 +                                      err = ret;
 +                                      goto out;
 +                              }
                        } else {
                                map = kmap(page);
                                read_extent_buffer(leaf, map + pg_offset, ptr,
                set_extent_uptodate(io_tree, em->start,
                                    extent_map_end(em) - 1, NULL, GFP_NOFS);
                goto insert;
 -      } else {
 -              WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
        }
  not_found:
        em->start = start;
        return ret;
  }
  
 +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
 +{
 +      struct radix_tree_root *root = &inode->i_mapping->page_tree;
 +      int found = false;
 +      void **pagep = NULL;
 +      struct page *page = NULL;
 +      int start_idx;
 +      int end_idx;
 +
 +      start_idx = start >> PAGE_CACHE_SHIFT;
 +
 +      /*
 +       * end is the last byte in the last page.  end == start is legal
 +       */
 +      end_idx = end >> PAGE_CACHE_SHIFT;
 +
 +      rcu_read_lock();
 +
 +      /*
 +       * Most of the code in this while loop is lifted from
 +       * find_get_page.  It's been modified to begin searching from a
 +       * page and return just the first page found in that range.  If the
 +       * found idx is less than or equal to the end idx then we know that
 +       * a page exists.  If no pages are found or if those pages are
 +       * outside of the range then we're fine (yay!)
 +       */
 +      while (page == NULL &&
 +             radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
 +              page = radix_tree_deref_slot(pagep);
 +              if (unlikely(!page))
 +                      break;
 +
 +              if (radix_tree_exception(page)) {
 +                      if (radix_tree_deref_retry(page)) {
 +                              page = NULL;
 +                              continue;
 +                      }
 +                      /*
 +                       * Otherwise, shmem/tmpfs must be storing a swap entry
 +                       * here as an exceptional entry: so return it without
 +                       * attempting to raise page count.
 +                       */
 +                      page = NULL;
 +                      break; /* TODO: Is this relevant for this use case? */
 +              }
 +
 +              if (!page_cache_get_speculative(page)) {
 +                      page = NULL;
 +                      continue;
 +              }
 +
 +              /*
 +               * Has the page moved?
 +               * This is part of the lockless pagecache protocol. See
 +               * include/linux/pagemap.h for details.
 +               */
 +              if (unlikely(page != *pagep)) {
 +                      page_cache_release(page);
 +                      page = NULL;
 +              }
 +      }
 +
 +      if (page) {
 +              if (page->index <= end_idx)
 +                      found = true;
 +              page_cache_release(page);
 +      }
 +
 +      rcu_read_unlock();
 +      return found;
 +}
 +
  static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                              struct extent_state **cached_state, int writing)
  {
                 * invalidate needs to happen so that reads after a write do not
                 * get stale data.
                 */
 -              if (!ordered && (!writing ||
 -                  !test_range_bit(&BTRFS_I(inode)->io_tree,
 -                                  lockstart, lockend, EXTENT_UPTODATE, 0,
 -                                  *cached_state)))
 +              if (!ordered &&
 +                  (!writing ||
 +                   !btrfs_page_exists_in_range(inode, lockstart, lockend)))
                        break;
  
                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@@ -7180,7 -7126,7 +7180,7 @@@ static void btrfs_end_dio_bio(struct bi
                 * before the atomic variable goes to zero, we must make
                 * sure dip->errors is perceived to be set.
                 */
 -              smp_mb__before_atomic_dec();
 +              smp_mb__before_atomic();
        }
  
        /* if there are more bios still pending for this dio, just exit */
@@@ -7360,7 -7306,7 +7360,7 @@@ out_err
          * before the atomic variable goes to zero, we must
          * make sure dip->errors is perceived to be set.
         */
 -      smp_mb__before_atomic_dec();
 +      smp_mb__before_atomic();
        if (atomic_dec_and_test(&dip->pending_bios))
                bio_io_error(dip->orig_bio);
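
  The two smp_mb__before_atomic() conversions in this file keep the same ordering guarantee under the renamed barrier API. A minimal sketch of the pattern, using the names from the surrounding code:

	/*
	 * Publish the error before the final decrement, so whichever
	 * context sees pending_bios hit zero also observes dip->errors.
	 * smp_mb__before_atomic() is the renamed equivalent of the
	 * smp_mb__before_atomic_dec() it replaces here.
	 */
	dip->errors = 1;
	smp_mb__before_atomic();
	if (atomic_dec_and_test(&dip->pending_bios))
		bio_io_error(dip->orig_bio);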
  
@@@ -7445,39 -7391,30 +7445,30 @@@ free_ordered
  }
  
  static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
-                       const struct iovec *iov, loff_t offset,
-                       unsigned long nr_segs)
+                       const struct iov_iter *iter, loff_t offset)
  {
        int seg;
        int i;
-       size_t size;
-       unsigned long addr;
        unsigned blocksize_mask = root->sectorsize - 1;
        ssize_t retval = -EINVAL;
-       loff_t end = offset;
  
        if (offset & blocksize_mask)
                goto out;
  
-       /* Check the memory alignment.  Blocks cannot straddle pages */
-       for (seg = 0; seg < nr_segs; seg++) {
-               addr = (unsigned long)iov[seg].iov_base;
-               size = iov[seg].iov_len;
-               end += size;
-               if ((addr & blocksize_mask) || (size & blocksize_mask))
-                       goto out;
-               /* If this is a write we don't need to check anymore */
-               if (rw & WRITE)
-                       continue;
+       if (iov_iter_alignment(iter) & blocksize_mask)
+               goto out;
  
-               /*
-                * Check to make sure we don't have duplicate iov_base's in this
-                * iovec, if so return EINVAL, otherwise we'll get csum errors
-                * when reading back.
-                */
-               for (i = seg + 1; i < nr_segs; i++) {
-                       if (iov[seg].iov_base == iov[i].iov_base)
+       /* If this is a write we don't need to check anymore */
+       if (rw & WRITE)
+               return 0;
+       /*
+        * Check that we don't have duplicate iov_base values in this
+        * iovec; if we do, return EINVAL, otherwise we'll get csum
+        * errors when reading back.
+        */
+       for (seg = 0; seg < iter->nr_segs; seg++) {
+               for (i = seg + 1; i < iter->nr_segs; i++) {
+                       if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
                                goto out;
                }
        }
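
  check_direct_IO() now folds the old per-segment address and length checks into a single iov_iter_alignment() test. A hedged sketch of what such a helper computes for a plain iovec-backed iter (not the kernel implementation itself): it ORs every segment's base and length together, so one mask test catches any misaligned segment.

	/* Sketch only; assumes an iovec-backed iter. */
	static unsigned long iovec_alignment(const struct iovec *iov,
					     unsigned long nr_segs)
	{
		unsigned long res = 0;
		unsigned long seg;

		for (seg = 0; seg < nr_segs; seg++)
			res |= (unsigned long)iov[seg].iov_base | iov[seg].iov_len;
		return res;	/* caller tests res & blocksize_mask */
	}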
@@@ -7487,8 -7424,7 +7478,7 @@@ out
  }
  
  static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
-                       const struct iovec *iov, loff_t offset,
-                       unsigned long nr_segs)
+                       struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        bool relock = false;
        ssize_t ret;
  
-       if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
-                           offset, nr_segs))
+       if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
                return 0;
  
        atomic_inc(&inode->i_dio_count);
 -      smp_mb__after_atomic_inc();
 +      smp_mb__after_atomic();
  
        /*
         * The generic stuff only does filemap_write_and_wait_range, which
         * we need to flush the dirty pages again to make absolutely sure
         * that any outstanding dirty pages are on disk.
         */
-       count = iov_length(iov, nr_segs);
+       count = iov_iter_count(iter);
        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                     &BTRFS_I(inode)->runtime_flags))
                filemap_fdatawrite_range(inode->i_mapping, offset, count);
  
        ret = __blockdev_direct_IO(rw, iocb, inode,
                        BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-                       iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+                       iter, offset, btrfs_get_blocks_direct, NULL,
                        btrfs_submit_direct, flags);
        if (rw & WRITE) {
                if (ret < 0 && ret != -EIOCBQUEUED)
@@@ -8046,7 -7981,7 +8035,7 @@@ int btrfs_create_subvol_root(struct btr
        err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
        if (err)
                btrfs_err(new_root->fs_info,
 -                        "error inheriting subvolume %llu properties: %d\n",
 +                        "error inheriting subvolume %llu properties: %d",
                          new_root->root_key.objectid, err);
  
        err = btrfs_update_inode(trans, new_root, inode);
@@@ -8365,7 -8300,7 +8354,7 @@@ static int btrfs_rename(struct inode *o
        BTRFS_I(old_inode)->dir_index = 0ULL;
        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
                /* force full log commit if subvolume involved. */
 -              root->fs_info->last_trans_log_full_commit = trans->transid;
 +              btrfs_set_log_full_commit(root->fs_info, trans);
        } else {
                ret = btrfs_insert_inode_ref(trans, dest,
                                             new_dentry->d_name.name,
@@@ -8943,66 -8878,6 +8932,66 @@@ static int btrfs_permission(struct inod
        return generic_permission(inode, mask);
  }
  
 +static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 +{
 +      struct btrfs_trans_handle *trans;
 +      struct btrfs_root *root = BTRFS_I(dir)->root;
 +      struct inode *inode = NULL;
 +      u64 objectid;
 +      u64 index;
 +      int ret = 0;
 +
 +      /*
 +       * 5 units required for adding orphan entry
 +       */
 +      trans = btrfs_start_transaction(root, 5);
 +      if (IS_ERR(trans))
 +              return PTR_ERR(trans);
 +
 +      ret = btrfs_find_free_ino(root, &objectid);
 +      if (ret)
 +              goto out;
 +
 +      inode = btrfs_new_inode(trans, root, dir, NULL, 0,
 +                              btrfs_ino(dir), objectid, mode, &index);
 +      if (IS_ERR(inode)) {
 +              ret = PTR_ERR(inode);
 +              inode = NULL;
 +              goto out;
 +      }
 +
 +      ret = btrfs_init_inode_security(trans, inode, dir, NULL);
 +      if (ret)
 +              goto out;
 +
 +      ret = btrfs_update_inode(trans, root, inode);
 +      if (ret)
 +              goto out;
 +
 +      inode->i_fop = &btrfs_file_operations;
 +      inode->i_op = &btrfs_file_inode_operations;
 +
 +      inode->i_mapping->a_ops = &btrfs_aops;
 +      inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 +      BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 +
 +      ret = btrfs_orphan_add(trans, inode);
 +      if (ret)
 +              goto out;
 +
 +      d_tmpfile(dentry, inode);
 +      mark_inode_dirty(inode);
 +
 +out:
 +      btrfs_end_transaction(trans, root);
 +      if (ret)
 +              iput(inode);
 +      btrfs_balance_delayed_items(root);
 +      btrfs_btree_balance_dirty(root);
 +
 +      return ret;
 +}
 +
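
  For context, a hedged userspace sketch of the flow btrfs_tmpfile() and the new btrfs_link() orphan handling serve: the inode starts out unlinked, kept alive only by its orphan item, and a later linkat() gives it a name, at which point the link path can drop the orphan entry. The mount point and file name are illustrative.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char proc_path[64];
		int fd = open("/mnt/btrfs", O_TMPFILE | O_RDWR, 0600);

		if (fd < 0) {
			perror("open(O_TMPFILE)");
			return 1;
		}
		if (write(fd, "scratch", 7) < 0)
			perror("write");

		/* Give the anonymous inode a name; i_nlink goes 0 -> 1. */
		snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d", fd);
		if (linkat(AT_FDCWD, proc_path, AT_FDCWD, "/mnt/btrfs/file",
			   AT_SYMLINK_FOLLOW) < 0)
			perror("linkat");

		close(fd);
		return 0;
	}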
  static const struct inode_operations btrfs_dir_inode_operations = {
        .getattr        = btrfs_getattr,
        .lookup         = btrfs_lookup,
        .get_acl        = btrfs_get_acl,
        .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
 +      .tmpfile        = btrfs_tmpfile,
  };
  static const struct inode_operations btrfs_dir_ro_inode_operations = {
        .lookup         = btrfs_lookup,
diff --combined fs/ceph/addr.c
index 65a30e817dd80ab9c7264ade89b9cae563998465,342ca5e423f9bb22ecbd1974778eb99c99d5ae6b..4f3f69079f362280379edf3b13c4766247c764fa
@@@ -694,7 -694,7 +694,7 @@@ static int ceph_writepages_start(struc
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
  
        if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
 -              pr_warning("writepage_start %p on forced umount\n", inode);
 +              pr_warn("writepage_start %p on forced umount\n", inode);
                return -EIO; /* we're in a forced umount, don't write! */
        }
        if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
@@@ -1187,8 -1187,8 +1187,8 @@@ static int ceph_write_end(struct file *
   * never get called.
   */
  static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
-                             const struct iovec *iov,
-                             loff_t pos, unsigned long nr_segs)
+                             struct iov_iter *iter,
+                             loff_t pos)
  {
        WARN_ON(1);
        return -EINVAL;
diff --combined fs/cifs/cifsfs.c
index 6aaa8112c538a73c82b15eaf8dd733abd21f39f0,496b520934e01adafd7a9a1d2ae90549ca008433..2c90d07c0b3aa3a6db836e0290fd0ecc2137b317
@@@ -87,6 -87,10 +87,6 @@@ extern mempool_t *cifs_mid_poolp
  
  struct workqueue_struct       *cifsiod_wq;
  
 -#ifdef CONFIG_CIFS_SMB2
 -__u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
 -#endif
 -
  /*
   * Bumps refcount for cifs super block.
   * Note that it should only be called if a reference to the VFS super block is
@@@ -247,7 -251,11 +247,7 @@@ cifs_alloc_inode(struct super_block *sb
         * server, cannot assume caching of file data or metadata.
         */
        cifs_set_oplock_level(cifs_inode, 0);
 -      cifs_inode->delete_pending = false;
 -      cifs_inode->invalid_mapping = false;
 -      clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags);
 -      clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags);
 -      clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags);
 +      cifs_inode->flags = 0;
        spin_lock_init(&cifs_inode->writers_lock);
        cifs_inode->writers = 0;
        cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
@@@ -294,7 -302,7 +294,7 @@@ cifs_show_address(struct seq_file *s, s
        struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
        struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
  
 -      seq_printf(s, ",addr=");
 +      seq_puts(s, ",addr=");
  
        switch (server->dstaddr.ss_family) {
        case AF_INET:
                        seq_printf(s, "%%%u", sa6->sin6_scope_id);
                break;
        default:
 -              seq_printf(s, "(unknown)");
 +              seq_puts(s, "(unknown)");
        }
  }
  
@@@ -316,45 -324,45 +316,45 @@@ cifs_show_security(struct seq_file *s, 
        if (ses->sectype == Unspecified)
                return;
  
 -      seq_printf(s, ",sec=");
 +      seq_puts(s, ",sec=");
  
        switch (ses->sectype) {
        case LANMAN:
 -              seq_printf(s, "lanman");
 +              seq_puts(s, "lanman");
                break;
        case NTLMv2:
 -              seq_printf(s, "ntlmv2");
 +              seq_puts(s, "ntlmv2");
                break;
        case NTLM:
 -              seq_printf(s, "ntlm");
 +              seq_puts(s, "ntlm");
                break;
        case Kerberos:
 -              seq_printf(s, "krb5");
 +              seq_puts(s, "krb5");
                break;
        case RawNTLMSSP:
 -              seq_printf(s, "ntlmssp");
 +              seq_puts(s, "ntlmssp");
                break;
        default:
                /* shouldn't ever happen */
 -              seq_printf(s, "unknown");
 +              seq_puts(s, "unknown");
                break;
        }
  
        if (ses->sign)
 -              seq_printf(s, "i");
 +              seq_puts(s, "i");
  }
  
  static void
  cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
  {
 -      seq_printf(s, ",cache=");
 +      seq_puts(s, ",cache=");
  
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
 -              seq_printf(s, "strict");
 +              seq_puts(s, "strict");
        else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
 -              seq_printf(s, "none");
 +              seq_puts(s, "none");
        else
 -              seq_printf(s, "loose");
 +              seq_puts(s, "loose");
  }
  
  static void
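
  The seq_printf() to seq_puts() conversions throughout this file follow one rule: seq_puts() for fixed strings with no format specifiers, seq_printf() only where a value is interpolated. Both cases appear in the options code below:

	seq_puts(s, ",seal");			/* constant string */
	seq_printf(s, ",uid=%u",		/* formatted value */
		   from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid));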
@@@ -387,7 -395,7 +387,7 @@@ cifs_show_options(struct seq_file *s, s
        cifs_show_cache_flavor(s, cifs_sb);
  
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
 -              seq_printf(s, ",multiuser");
 +              seq_puts(s, ",multiuser");
        else if (tcon->ses->user_name)
                seq_printf(s, ",username=%s", tcon->ses->user_name);
  
        seq_printf(s, ",uid=%u",
                   from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid));
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
 -              seq_printf(s, ",forceuid");
 +              seq_puts(s, ",forceuid");
        else
 -              seq_printf(s, ",noforceuid");
 +              seq_puts(s, ",noforceuid");
  
        seq_printf(s, ",gid=%u",
                   from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid));
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
 -              seq_printf(s, ",forcegid");
 +              seq_puts(s, ",forcegid");
        else
 -              seq_printf(s, ",noforcegid");
 +              seq_puts(s, ",noforcegid");
  
        cifs_show_address(s, tcon->ses->server);
  
        cifs_show_nls(s, cifs_sb->local_nls);
  
        if (tcon->seal)
 -              seq_printf(s, ",seal");
 +              seq_puts(s, ",seal");
        if (tcon->nocase)
 -              seq_printf(s, ",nocase");
 +              seq_puts(s, ",nocase");
        if (tcon->retry)
 -              seq_printf(s, ",hard");
 +              seq_puts(s, ",hard");
        if (tcon->unix_ext)
 -              seq_printf(s, ",unix");
 +              seq_puts(s, ",unix");
        else
 -              seq_printf(s, ",nounix");
 +              seq_puts(s, ",nounix");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
 -              seq_printf(s, ",posixpaths");
 +              seq_puts(s, ",posixpaths");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
 -              seq_printf(s, ",setuids");
 +              seq_puts(s, ",setuids");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
 -              seq_printf(s, ",serverino");
 +              seq_puts(s, ",serverino");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
 -              seq_printf(s, ",rwpidforward");
 +              seq_puts(s, ",rwpidforward");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
 -              seq_printf(s, ",forcemand");
 +              seq_puts(s, ",forcemand");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 -              seq_printf(s, ",nouser_xattr");
 +              seq_puts(s, ",nouser_xattr");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
 -              seq_printf(s, ",mapchars");
 +              seq_puts(s, ",mapchars");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
 -              seq_printf(s, ",sfu");
 +              seq_puts(s, ",sfu");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 -              seq_printf(s, ",nobrl");
 +              seq_puts(s, ",nobrl");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
 -              seq_printf(s, ",cifsacl");
 +              seq_puts(s, ",cifsacl");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
 -              seq_printf(s, ",dynperm");
 +              seq_puts(s, ",dynperm");
        if (root->d_sb->s_flags & MS_POSIXACL)
 -              seq_printf(s, ",acl");
 +              seq_puts(s, ",acl");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
 -              seq_printf(s, ",mfsymlinks");
 +              seq_puts(s, ",mfsymlinks");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
 -              seq_printf(s, ",fsc");
 +              seq_puts(s, ",fsc");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
 -              seq_printf(s, ",nostrictsync");
 +              seq_puts(s, ",nostrictsync");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
 -              seq_printf(s, ",noperm");
 +              seq_puts(s, ",noperm");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
                seq_printf(s, ",backupuid=%u",
                           from_kuid_munged(&init_user_ns,
@@@ -725,8 -733,7 +725,7 @@@ out_nls
        goto out;
  }
  
- static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                  unsigned long nr_segs, loff_t pos)
+ static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct inode *inode = file_inode(iocb->ki_filp);
        struct cifsInodeInfo *cinode = CIFS_I(inode);
        if (written)
                return written;
  
-       written = generic_file_aio_write(iocb, iov, nr_segs, pos);
+       written = generic_file_write_iter(iocb, from);
  
        if (CIFS_CACHE_WRITE(CIFS_I(inode)))
                goto out;
  
        rc = filemap_fdatawrite(inode->i_mapping);
        if (rc)
-               cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n",
+               cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
                         rc, inode);
  
  out:
@@@ -880,10 -887,10 +879,10 @@@ const struct inode_operations cifs_syml
  };
  
  const struct file_operations cifs_file_ops = {
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = generic_file_aio_read,
-       .aio_write = cifs_file_aio_write,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = generic_file_read_iter,
+       .write_iter = cifs_file_write_iter,
        .open = cifs_open,
        .release = cifs_close,
        .lock = cifs_lock,
  };
  
  const struct file_operations cifs_file_strict_ops = {
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = cifs_strict_readv,
-       .aio_write = cifs_strict_writev,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = cifs_strict_readv,
+       .write_iter = cifs_strict_writev,
        .open = cifs_open,
        .release = cifs_close,
        .lock = cifs_lock,
  
  const struct file_operations cifs_file_direct_ops = {
        /* BB reevaluate whether they can be done with directio, no cache */
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = cifs_user_readv,
-       .aio_write = cifs_user_writev,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = cifs_user_readv,
+       .write_iter = cifs_user_writev,
        .open = cifs_open,
        .release = cifs_close,
        .lock = cifs_lock,
  };
  
  const struct file_operations cifs_file_nobrl_ops = {
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = generic_file_aio_read,
-       .aio_write = cifs_file_aio_write,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = generic_file_read_iter,
+       .write_iter = cifs_file_write_iter,
        .open = cifs_open,
        .release = cifs_close,
        .fsync = cifs_fsync,
  };
  
  const struct file_operations cifs_file_strict_nobrl_ops = {
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = cifs_strict_readv,
-       .aio_write = cifs_strict_writev,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = cifs_strict_readv,
+       .write_iter = cifs_strict_writev,
        .open = cifs_open,
        .release = cifs_close,
        .fsync = cifs_strict_fsync,
  
  const struct file_operations cifs_file_direct_nobrl_ops = {
        /* BB reevaluate whether they can be done with directio, no cache */
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = cifs_user_readv,
-       .aio_write = cifs_user_writev,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = cifs_user_readv,
+       .write_iter = cifs_user_writev,
        .open = cifs_open,
        .release = cifs_close,
        .fsync = cifs_fsync,
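
  All of these operation tables pair the .read/.write shims with ->read_iter/->write_iter implementations. A simplified sketch of how the new_sync_read() shim bridges a plain read(2) to ->read_iter(); this is close to, but not verbatim, the VFS helper:

	static ssize_t new_sync_read(struct file *filp, char __user *buf,
				     size_t len, loff_t *ppos)
	{
		struct iovec iov = { .iov_base = buf, .iov_len = len };
		struct kiocb kiocb;
		struct iov_iter iter;
		ssize_t ret;

		init_sync_kiocb(&kiocb, filp);
		kiocb.ki_pos = *ppos;
		iov_iter_init(&iter, READ, &iov, 1, len);

		ret = filp->f_op->read_iter(&kiocb, &iter);
		*ppos = kiocb.ki_pos;
		return ret;
	}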
@@@ -1184,6 -1191,10 +1183,6 @@@ init_cifs(void
        spin_lock_init(&cifs_file_list_lock);
        spin_lock_init(&GlobalMid_Lock);
  
 -#ifdef CONFIG_CIFS_SMB2
 -      get_random_bytes(cifs_client_guid, SMB2_CLIENT_GUID_SIZE);
 -#endif
 -
        if (cifs_max_pending < 2) {
                cifs_max_pending = 2;
                cifs_dbg(FYI, "cifs_max_pending set to min of 2\n");
diff --combined fs/cifs/cifsfs.h
index 8fe51166d6e3192bb8aadf2d86f5a0acf494622a,c9e91886f0cfd08d108662d60d3d38fdf26c11f9..70f178a7c759525a17fc758e0637bf1d0c941ead
  #ifndef _CIFSFS_H
  #define _CIFSFS_H
  
 +#include <linux/hash.h>
 +
  #define ROOT_I 2
  
  /*
   * ino_t is 32 bits on a 32-bit arch. We have to squash the 64-bit value down
 - * so that it will fit.
 + * so that it will fit. We use hash_64 to convert the value to 31 bits, and
 + * then add 1, to ensure that we don't end up with a 0 as the value.
   */
 +#if BITS_PER_LONG == 64
  static inline ino_t
  cifs_uniqueid_to_ino_t(u64 fileid)
  {
 -      ino_t ino = (ino_t) fileid;
 -      if (sizeof(ino_t) < sizeof(u64))
 -              ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8;
 -      return ino;
 +      return (ino_t)fileid;
  }
 +#else
 +static inline ino_t
 +cifs_uniqueid_to_ino_t(u64 fileid)
 +{
 +      return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
 +}
 +#endif
  
  extern struct file_system_type cifs_fs_type;
  extern const struct address_space_operations cifs_addr_ops;
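
  A hedged illustration of the 32-bit branch above, with an arbitrary example fileid: hash_64() folds the 64-bit server-assigned id into 31 bits, and the +1 keeps the result nonzero, since an ino_t of 0 would look like "no inode" to userspace.

	/* Illustrative value only. */
	u64 fileid = 0x123456789abcdef0ULL;
	ino_t ino = (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
	/* With a 32-bit ino_t: hash_64(fileid, 31) < 2^31, so ino lies in [1, 2^31]. */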
@@@ -75,8 -67,6 +75,8 @@@ extern int cifs_revalidate_dentry_attr(
  extern int cifs_revalidate_file(struct file *filp);
  extern int cifs_revalidate_dentry(struct dentry *);
  extern int cifs_invalidate_mapping(struct inode *inode);
 +extern int cifs_revalidate_mapping(struct inode *inode);
 +extern int cifs_zap_mapping(struct inode *inode);
  extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
  extern int cifs_setattr(struct dentry *, struct iattr *);
  
@@@ -95,14 -85,10 +95,10 @@@ extern const struct file_operations cif
  extern int cifs_open(struct inode *inode, struct file *file);
  extern int cifs_close(struct inode *inode, struct file *file);
  extern int cifs_closedir(struct inode *inode, struct file *file);
- extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
-                              unsigned long nr_segs, loff_t pos);
- extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
-                                unsigned long nr_segs, loff_t pos);
- extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos);
- extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
-                                 unsigned long nr_segs, loff_t pos);
+ extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
+ extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+ extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
  extern int cifs_lock(struct file *, int, struct file_lock *);
  extern int cifs_fsync(struct file *, loff_t, loff_t, int);
  extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
@@@ -140,5 -126,5 +136,5 @@@ extern long cifs_ioctl(struct file *fil
  extern const struct export_operations cifs_export_ops;
  #endif /* CONFIG_CIFS_NFSD_EXPORT */
  
 -#define CIFS_VERSION   "2.02"
 +#define CIFS_VERSION   "2.03"
  #endif                                /* _CIFSFS_H */
diff --combined fs/cifs/file.c
index 208f56eca4bf4de164d8af873b0050ac4884c5ea,60e9b5fa22128fc7a127a5859f0c6070aae09c36..e90a1e9aa627642c9ccefd428319f43b3d379c2f
@@@ -335,7 -335,7 +335,7 @@@ cifs_new_fileinfo(struct cifs_fid *fid
        spin_unlock(&cifs_file_list_lock);
  
        if (fid->purge_cache)
 -              cifs_invalidate_mapping(inode);
 +              cifs_zap_mapping(inode);
  
        file->private_data = cfile;
        return cfile;
@@@ -392,7 -392,7 +392,7 @@@ void cifsFileInfo_put(struct cifsFileIn
                 * again and get at least level II oplock.
                 */
                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
 -                      CIFS_I(inode)->invalid_mapping = true;
 +                      set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags);
                cifs_set_oplock_level(cifsi, 0);
        }
        spin_unlock(&cifs_file_list_lock);
@@@ -1529,7 -1529,7 +1529,7 @@@ cifs_setlk(struct file *file, struct fi
                 */
                if (!CIFS_CACHE_WRITE(CIFS_I(inode)) &&
                                        CIFS_CACHE_READ(CIFS_I(inode))) {
 -                      cifs_invalidate_mapping(inode);
 +                      cifs_zap_mapping(inode);
                        cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n",
                                 inode);
                        CIFS_I(inode)->oplock = 0;
@@@ -2218,7 -2218,7 +2218,7 @@@ int cifs_strict_fsync(struct file *file
                 file->f_path.dentry->d_name.name, datasync);
  
        if (!CIFS_CACHE_READ(CIFS_I(inode))) {
 -              rc = cifs_invalidate_mapping(inode);
 +              rc = cifs_zap_mapping(inode);
                if (rc) {
                        cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc);
                        rc = 0; /* don't care about it in fsync */
@@@ -2385,14 -2385,12 +2385,12 @@@ cifs_uncached_retry_writev(struct cifs_
  }
  
  static ssize_t
- cifs_iovec_write(struct file *file, const struct iovec *iov,
-                unsigned long nr_segs, loff_t *poffset)
+ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
  {
        unsigned long nr_pages, i;
        size_t bytes, copied, len, cur_len;
        ssize_t total_written = 0;
        loff_t offset;
-       struct iov_iter it;
        struct cifsFileInfo *open_file;
        struct cifs_tcon *tcon;
        struct cifs_sb_info *cifs_sb;
        int rc;
        pid_t pid;
  
-       len = iov_length(iov, nr_segs);
-       if (!len)
-               return 0;
+       len = iov_iter_count(from);
        rc = generic_write_checks(file, poffset, &len, 0);
        if (rc)
                return rc;
  
+       if (!len)
+               return 0;
+       iov_iter_truncate(from, len);
        INIT_LIST_HEAD(&wdata_list);
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
        open_file = file->private_data;
        else
                pid = current->tgid;
  
-       iov_iter_init(&it, iov, nr_segs, len, 0);
        do {
                size_t save_len;
  
  
                save_len = cur_len;
                for (i = 0; i < nr_pages; i++) {
-                       bytes = min_t(const size_t, cur_len, PAGE_SIZE);
-                       copied = iov_iter_copy_from_user(wdata->pages[i], &it,
-                                                        0, bytes);
+                       bytes = min_t(size_t, cur_len, PAGE_SIZE);
+                       copied = copy_page_from_iter(wdata->pages[i], 0, bytes,
+                                                    from);
                        cur_len -= copied;
-                       iov_iter_advance(&it, copied);
                        /*
                         * If we didn't copy as much as we expected, then that
                         * may mean we trod into an unmapped area. Stop copying
@@@ -2546,11 -2544,11 +2544,11 @@@ restart_loop
        return total_written ? total_written : (ssize_t)rc;
  }
  
- ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos)
+ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
  {
        ssize_t written;
        struct inode *inode;
+       loff_t pos = iocb->ki_pos;
  
        inode = file_inode(iocb->ki_filp);
  
         * write request.
         */
  
-       written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
+       written = cifs_iovec_write(iocb->ki_filp, from, &pos);
        if (written > 0) {
 -              CIFS_I(inode)->invalid_mapping = true;
 +              set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags);
                iocb->ki_pos = pos;
        }
  
  }
  
  static ssize_t
- cifs_writev(struct kiocb *iocb, const struct iovec *iov,
-           unsigned long nr_segs, loff_t pos)
+ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
        mutex_lock(&inode->i_mutex);
        if (file->f_flags & O_APPEND)
                lock_pos = i_size_read(inode);
-       if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs),
+       if (!cifs_find_lock_conflict(cfile, lock_pos, iov_iter_count(from),
                                     server->vals->exclusive_lock_type, NULL,
                                     CIFS_WRITE_OP)) {
-               rc = __generic_file_aio_write(iocb, iov, nr_segs);
+               rc = __generic_file_write_iter(iocb, from);
                mutex_unlock(&inode->i_mutex);
  
                if (rc > 0) {
  }
  
  ssize_t
- cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
-                  unsigned long nr_segs, loff_t pos)
+ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
  {
        struct inode *inode = file_inode(iocb->ki_filp);
        struct cifsInodeInfo *cinode = CIFS_I(inode);
                if (cap_unix(tcon->ses) &&
                (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
                  && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
-                       written = generic_file_aio_write(
-                                       iocb, iov, nr_segs, pos);
+                       written = generic_file_write_iter(iocb, from);
                        goto out;
                }
-               written = cifs_writev(iocb, iov, nr_segs, pos);
+               written = cifs_writev(iocb, from);
                goto out;
        }
        /*
         * affected pages because it may cause a error with mandatory locks on
         * these pages but not on the region from pos to ppos+len-1.
         */
-       written = cifs_user_writev(iocb, iov, nr_segs, pos);
+       written = cifs_user_writev(iocb, from);
        if (written > 0 && CIFS_CACHE_READ(cinode)) {
                /*
                 * Windows 7 server can delay breaking level2 oplock if a write
                 * request comes - break it on the client to prevent reading
                 * an old data.
                 */
 -              cifs_invalidate_mapping(inode);
 +              cifs_zap_mapping(inode);
                cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
                         inode);
                cinode->oplock = 0;
@@@ -2831,32 -2826,25 +2826,25 @@@ cifs_uncached_read_into_pages(struct TC
        return total_read > 0 ? total_read : result;
  }
  
- ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
-                              unsigned long nr_segs, loff_t pos)
+ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
  {
        struct file *file = iocb->ki_filp;
        ssize_t rc;
        size_t len, cur_len;
        ssize_t total_read = 0;
-       loff_t offset = pos;
+       loff_t offset = iocb->ki_pos;
        unsigned int npages;
        struct cifs_sb_info *cifs_sb;
        struct cifs_tcon *tcon;
        struct cifsFileInfo *open_file;
        struct cifs_readdata *rdata, *tmp;
        struct list_head rdata_list;
-       struct iov_iter to;
        pid_t pid;
  
-       if (!nr_segs)
-               return 0;
-       len = iov_length(iov, nr_segs);
+       len = iov_iter_count(to);
        if (!len)
                return 0;
  
-       iov_iter_init(&to, iov, nr_segs, len, 0);
        INIT_LIST_HEAD(&rdata_list);
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
        open_file = file->private_data;
@@@ -2914,7 -2902,7 +2902,7 @@@ error
        if (!list_empty(&rdata_list))
                rc = 0;
  
-       len = iov_iter_count(&to);
+       len = iov_iter_count(to);
        /* the loop below should proceed in the order of increasing offsets */
        list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
        again:
                                        goto again;
                                }
                        } else {
-                               rc = cifs_readdata_to_iov(rdata, &to);
+                               rc = cifs_readdata_to_iov(rdata, to);
                        }
  
                }
                kref_put(&rdata->refcount, cifs_uncached_readdata_release);
        }
  
-       total_read = len - iov_iter_count(&to);
+       total_read = len - iov_iter_count(to);
  
        cifs_stats_bytes_read(tcon, total_read);
  
                rc = 0;
  
        if (total_read) {
-               iocb->ki_pos = pos + total_read;
+               iocb->ki_pos += total_read;
                return total_read;
        }
        return rc;
  }
  
  ssize_t
- cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
-                 unsigned long nr_segs, loff_t pos)
+ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
  {
        struct inode *inode = file_inode(iocb->ki_filp);
        struct cifsInodeInfo *cinode = CIFS_I(inode);
         * pos+len-1.
         */
        if (!CIFS_CACHE_READ(cinode))
-               return cifs_user_readv(iocb, iov, nr_segs, pos);
+               return cifs_user_readv(iocb, to);
  
        if (cap_unix(tcon->ses) &&
            (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
            ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
-               return generic_file_aio_read(iocb, iov, nr_segs, pos);
+               return generic_file_read_iter(iocb, to);
  
        /*
         * We need to hold the sem to be sure nobody modifies lock list
         * with a brlock that prevents reading.
         */
        down_read(&cinode->lock_sem);
-       if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
+       if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
                                     tcon->ses->server->vals->shared_lock_type,
                                     NULL, CIFS_READ_OP))
-               rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
+               rc = generic_file_read_iter(iocb, to);
        up_read(&cinode->lock_sem);
        return rc;
  }
@@@ -3112,7 -3099,7 +3099,7 @@@ int cifs_file_strict_mmap(struct file *
        xid = get_xid();
  
        if (!CIFS_CACHE_READ(CIFS_I(inode))) {
 -              rc = cifs_invalidate_mapping(inode);
 +              rc = cifs_zap_mapping(inode);
                if (rc)
                        return rc;
        }
@@@ -3670,7 -3657,7 +3657,7 @@@ void cifs_oplock_break(struct work_stru
                if (!CIFS_CACHE_READ(cinode)) {
                        rc = filemap_fdatawait(inode->i_mapping);
                        mapping_set_error(inode->i_mapping, rc);
 -                      cifs_invalidate_mapping(inode);
 +                      cifs_zap_mapping(inode);
                }
                cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc);
        }
    * Direct IO is not yet supported in cached mode.
   */
  static ssize_t
- cifs_direct_io(int rw, struct kiocb *iocb, const struct iovec *iov,
-                loff_t pos, unsigned long nr_segs)
+ cifs_direct_io(int rw, struct kiocb *iocb, struct iov_iter *iter,
+                loff_t pos)
  {
          /*
           * FIXME
diff --combined fs/dcache.c
index 1792d6075b4f80ced75e04d17c02b8d362b51aac,e99c6f529ba8bbd307bd5024cb723056b74afe17..06f65857a855725247c1190d243c0e19cccd8570
@@@ -150,7 -150,7 +150,7 @@@ static long get_nr_dentry_unused(void
        return sum < 0 ? 0 : sum;
  }
  
 -int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 +int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
                   size_t *lenp, loff_t *ppos)
  {
        dentry_stat.nr_dentry = get_nr_dentry();
@@@ -532,10 -532,12 +532,12 @@@ static inline struct dentry *lock_paren
        struct dentry *parent = dentry->d_parent;
        if (IS_ROOT(dentry))
                return NULL;
+       if (unlikely((int)dentry->d_lockref.count < 0))
+               return NULL;
        if (likely(spin_trylock(&parent->d_lock)))
                return parent;
-       spin_unlock(&dentry->d_lock);
        rcu_read_lock();
+       spin_unlock(&dentry->d_lock);
  again:
        parent = ACCESS_ONCE(dentry->d_parent);
        spin_lock(&parent->d_lock);
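
  A hedged restatement of the two lock_parent() changes above, with the reasoning as comments:

	if (unlikely((int)dentry->d_lockref.count < 0))
		return NULL;	/* all-but-freed: ->d_parent may be stale */
	if (likely(spin_trylock(&parent->d_lock)))
		return parent;
	rcu_read_lock();		/* pin the parent's memory first... */
	spin_unlock(&dentry->d_lock);	/* ...then let go of the child */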
diff --combined fs/ext3/inode.c
index 695abe738a2409f4c32f4ef7d5749757d98b6f15,4d32133a76c4bbfd0393528f8ad463bf239bf354..2c6ccc49ba279cacf77fe6609fe44a50b970898c
@@@ -1716,17 -1716,17 +1716,17 @@@ static int ext3_journalled_writepage(st
        WARN_ON_ONCE(IS_RDONLY(inode) &&
                     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
  
 -      if (ext3_journal_current_handle())
 -              goto no_write;
 -
        trace_ext3_journalled_writepage(page);
 -      handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 -      if (IS_ERR(handle)) {
 -              ret = PTR_ERR(handle);
 -              goto no_write;
 -      }
 -
        if (!page_has_buffers(page) || PageChecked(page)) {
 +              if (ext3_journal_current_handle())
 +                      goto no_write;
 +
 +              handle = ext3_journal_start(inode,
 +                                          ext3_writepage_trans_blocks(inode));
 +              if (IS_ERR(handle)) {
 +                      ret = PTR_ERR(handle);
 +                      goto no_write;
 +              }
                /*
                 * It's mmapped pagecache.  Add buffers and journal it.  There
                 * doesn't seem much point in redirtying the page here.
                atomic_set(&EXT3_I(inode)->i_datasync_tid,
                           handle->h_transaction->t_tid);
                unlock_page(page);
 +              err = ext3_journal_stop(handle);
 +              if (!ret)
 +                      ret = err;
        } else {
                /*
 -               * It may be a page full of checkpoint-mode buffers.  We don't
 -               * really know unless we go poke around in the buffer_heads.
 -               * But block_write_full_page will do the right thing.
 +               * It is a page full of checkpoint-mode buffers. Go and write
 +               * them. They should have been already mapped when they went
 +               * to the journal so provide NULL get_block function to catch
 +               * errors.
                 */
 -              ret = block_write_full_page(page, ext3_get_block, wbc);
 +              ret = block_write_full_page(page, NULL, wbc);
        }
 -      err = ext3_journal_stop(handle);
 -      if (!ret)
 -              ret = err;
  out:
        return ret;
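
  A hedged note on the NULL get_block passed above: in data=journal mode those buffers were mapped when they first went to the journal, so writeback should never need to allocate; passing NULL makes any still-unmapped buffer an error rather than a silent allocation.

	/* Checkpoint-mode buffers are already mapped; NULL catches bugs. */
	ret = block_write_full_page(page, NULL, wbc);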
  
@@@ -1821,8 -1820,7 +1821,7 @@@ static int ext3_releasepage(struct pag
   * VFS code falls back into buffered path in that case so we are safe.
   */
  static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
-                       const struct iovec *iov, loff_t offset,
-                       unsigned long nr_segs)
+                       struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        handle_t *handle;
        ssize_t ret;
        int orphan = 0;
-       size_t count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        int retries = 0;
  
-       trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+       trace_ext3_direct_IO_enter(inode, offset, count, rw);
  
        if (rw == WRITE) {
                loff_t final_size = offset + count;
        }
  
  retry:
-       ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
-                                ext3_get_block);
+       ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block);
        /*
         * In case of error extending write may have instantiated a few
         * blocks outside i_size. Trim these off again.
         */
        if (unlikely((rw & WRITE) && ret < 0)) {
                loff_t isize = i_size_read(inode);
-               loff_t end = offset + iov_length(iov, nr_segs);
+               loff_t end = offset + count;
  
                if (end > isize)
                        ext3_truncate_failed_direct_write(inode);
                        ret = err;
        }
  out:
-       trace_ext3_direct_IO_exit(inode, offset,
-                               iov_length(iov, nr_segs), rw, ret);
+       trace_ext3_direct_IO_exit(inode, offset, count, rw, ret);
        return ret;
  }
  
diff --combined fs/ext4/ext4.h
index 1479e2ae00d28e83e8d1c175752a61b59828cd55,eb37d76bf9116f02dc005a837655062a9099aa26..7cc5a0e23688e1a2ce071dcb646b725a266ff890
@@@ -875,8 -875,6 +875,8 @@@ struct ext4_inode_info 
        struct inode vfs_inode;
        struct jbd2_inode *jinode;
  
 +      spinlock_t i_raw_lock;  /* protects updates to the raw inode */
 +
        /*
         * File creation time. Its function is same as that of
         * struct timespec i_{a,c,m}time in the generic inode.
@@@ -1160,8 -1158,7 +1160,8 @@@ struct ext4_super_block 
        __le32  s_usr_quota_inum;       /* inode for tracking user quota */
        __le32  s_grp_quota_inum;       /* inode for tracking group quota */
        __le32  s_overhead_clusters;    /* overhead blocks/clusters in fs */
 -      __le32  s_reserved[108];        /* Padding to the end of the block */
 +      __le32  s_backup_bgs[2];        /* groups with sparse_super2 SBs */
 +      __le32  s_reserved[106];        /* Padding to the end of the block */
        __le32  s_checksum;             /* crc32c(superblock) */
  };
  
@@@ -1507,7 -1504,6 +1507,7 @@@ static inline void ext4_clear_state_fla
  #define EXT4_FEATURE_COMPAT_EXT_ATTR          0x0008
  #define EXT4_FEATURE_COMPAT_RESIZE_INODE      0x0010
  #define EXT4_FEATURE_COMPAT_DIR_INDEX         0x0020
 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2     0x0200
  
  #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER   0x0001
  #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE     0x0002
@@@ -1956,6 -1952,10 +1956,6 @@@ extern void ext4_get_group_no_and_offse
  extern ext4_group_t ext4_get_group_number(struct super_block *sb,
                                          ext4_fsblk_t block);
  
 -extern void ext4_validate_block_bitmap(struct super_block *sb,
 -                                     struct ext4_group_desc *desc,
 -                                     ext4_group_t block_group,
 -                                     struct buffer_head *bh);
  extern unsigned int ext4_block_group(struct super_block *sb,
                        ext4_fsblk_t blocknr);
  extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
@@@ -1984,9 -1984,16 +1984,9 @@@ extern int ext4_wait_block_bitmap(struc
                                  struct buffer_head *bh);
  extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
                                                  ext4_group_t block_group);
 -extern void ext4_init_block_bitmap(struct super_block *sb,
 -                                 struct buffer_head *bh,
 -                                 ext4_group_t group,
 -                                 struct ext4_group_desc *desc);
  extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
                                              ext4_group_t block_group,
                                              struct ext4_group_desc *gdp);
 -extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
 -                                         ext4_group_t block_group,
 -                                         struct ext4_group_desc *gdp);
  ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
  
  /* dir.c */
@@@ -2129,6 -2136,8 +2129,6 @@@ extern int ext4_alloc_da_blocks(struct 
  extern void ext4_set_aops(struct inode *inode);
  extern int ext4_writepage_trans_blocks(struct inode *);
  extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 -extern int ext4_block_truncate_page(handle_t *handle,
 -              struct address_space *mapping, loff_t from);
  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t lend);
  extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
@@@ -2140,8 -2149,7 +2140,7 @@@ extern void ext4_da_update_reserve_spac
  extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_map_blocks *map, int flags);
  extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
-                               const struct iovec *iov, loff_t offset,
-                               unsigned long nr_segs);
+                               struct iov_iter *iter, loff_t offset);
  extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
  extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
  extern void ext4_ind_truncate(handle_t *, struct inode *inode);
@@@ -2188,6 -2196,8 +2187,6 @@@ extern int ext4_resize_fs(struct super_
  
  /* super.c */
  extern int ext4_calculate_overhead(struct super_block *sb);
 -extern int ext4_superblock_csum_verify(struct super_block *sb,
 -                                     struct ext4_super_block *es);
  extern void ext4_superblock_csum_set(struct super_block *sb);
  extern void *ext4_kvmalloc(size_t size, gfp_t flags);
  extern void *ext4_kvzalloc(size_t size, gfp_t flags);
@@@ -2559,11 -2569,19 +2558,11 @@@ extern const struct file_operations ext
  extern const struct inode_operations ext4_file_inode_operations;
  extern const struct file_operations ext4_file_operations;
  extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 -extern void ext4_unwritten_wait(struct inode *inode);
  
  /* inline.c */
  extern int ext4_has_inline_data(struct inode *inode);
 -extern int ext4_get_inline_size(struct inode *inode);
  extern int ext4_get_max_inline_size(struct inode *inode);
  extern int ext4_find_inline_data_nolock(struct inode *inode);
 -extern void ext4_write_inline_data(struct inode *inode,
 -                                 struct ext4_iloc *iloc,
 -                                 void *buffer, loff_t pos,
 -                                 unsigned int len);
 -extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
 -                                  unsigned int len);
  extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
                                 unsigned int len);
  extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
@@@ -2751,11 -2769,13 +2750,11 @@@ extern void ext4_io_submit(struct ext4_
  extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               struct page *page,
                               int len,
 -                             struct writeback_control *wbc);
 +                             struct writeback_control *wbc,
 +                             bool keep_towrite);
  
  /* mmp.c */
  extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
 -extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
 -extern int ext4_mmp_csum_verify(struct super_block *sb,
 -                              struct mmp_struct *mmp);
  
  /*
   * Note that these flags will never ever appear in a buffer_head's state flag.
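
[Editor's note] The s_backup_bgs[]/SPARSE_SUPER2 hunks above replace the classic backup-superblock placement (group 1 plus every group whose number is a power of 3, 5 or 7) with exactly two explicitly recorded backup groups. A minimal userspace sketch of the classic rule being superseded (illustrative only, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	/* Classic sparse_super rule: backups live in group 1 and in every
	 * group whose number is a power of 3, 5 or 7.  SPARSE_SUPER2 drops
	 * this in favour of the two groups listed in s_backup_bgs[]. */
	static bool is_power_of(unsigned int group, unsigned int base)
	{
		if (group == 0)
			return false;
		while (group % base == 0)
			group /= base;
		return group == 1;
	}

	static bool bg_has_backup_sb(unsigned int group)
	{
		return group == 1 || is_power_of(group, 3) ||
		       is_power_of(group, 5) || is_power_of(group, 7);
	}

	int main(void)
	{
		unsigned int g;

		for (g = 1; g < 130; g++)
			if (bg_has_backup_sb(g))
				printf("group %u carries a backup superblock\n", g);
		return 0;
	}
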
diff --combined fs/ext4/file.c
index 4e8bc284ec0e96296e8bbcf68423b9ea9ee8c921,708aad7681991368262332520f09f5490af848de..8695f70af1ef2046c2f68a24a5ed4e195cd6dc88
@@@ -57,7 -57,7 +57,7 @@@ static int ext4_release_file(struct ino
        return 0;
  }
  
 -void ext4_unwritten_wait(struct inode *inode)
 +static void ext4_unwritten_wait(struct inode *inode)
  {
        wait_queue_head_t *wq = ext4_ioend_wq(inode);
  
   * or one thread will zero the other's data, causing corruption.
   */
  static int
- ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
-                  unsigned long nr_segs, loff_t pos)
+ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
  {
        struct super_block *sb = inode->i_sb;
        int blockmask = sb->s_blocksize - 1;
-       size_t count = iov_length(iov, nr_segs);
-       loff_t final_size = pos + count;
  
        if (pos >= i_size_read(inode))
                return 0;
  
-       if ((pos & blockmask) || (final_size & blockmask))
+       if ((pos | iov_iter_alignment(from)) & blockmask)
                return 1;
  
        return 0;
  }
  
  static ssize_t
- ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long nr_segs, loff_t pos)
+ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(iocb->ki_filp);
        struct blk_plug plug;
        int o_direct = file->f_flags & O_DIRECT;
        int overwrite = 0;
-       size_t length = iov_length(iov, nr_segs);
+       size_t length = iov_iter_count(from);
        ssize_t ret;
-       BUG_ON(iocb->ki_pos != pos);
+       loff_t pos = iocb->ki_pos;
  
        /*
         * Unaligned direct AIO must be serialized; see comment above
            ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
            !is_sync_kiocb(iocb) &&
            (file->f_flags & O_APPEND ||
-            ext4_unaligned_aio(inode, iov, nr_segs, pos))) {
+            ext4_unaligned_aio(inode, from, pos))) {
                aio_mutex = ext4_aio_mutex(inode);
                mutex_lock(aio_mutex);
                ext4_unwritten_wait(inode);
                        goto errout;
                }
  
-               if (pos + length > sbi->s_bitmap_maxbytes) {
-                       nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
-                                             sbi->s_bitmap_maxbytes - pos);
-               }
+               if (pos + length > sbi->s_bitmap_maxbytes)
+                       iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
        }
  
        if (o_direct) {
                }
        }
  
-       ret = __generic_file_aio_write(iocb, iov, nr_segs);
+       ret = __generic_file_write_iter(iocb, from);
        mutex_unlock(&inode->i_mutex);
  
        if (ret > 0) {
@@@ -244,7 -237,6 +237,7 @@@ static int ext4_file_open(struct inode 
                        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
                        if (IS_ERR(handle))
                                return PTR_ERR(handle);
 +                      BUFFER_TRACE(sbi->s_sbh, "get_write_access");
                        err = ext4_journal_get_write_access(handle, sbi->s_sbh);
                        if (err) {
                                ext4_journal_stop(handle);
@@@ -594,10 -586,10 +587,10 @@@ loff_t ext4_llseek(struct file *file, l
  
  const struct file_operations ext4_file_operations = {
        .llseek         = ext4_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = generic_file_aio_read,
-       .aio_write      = ext4_file_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = generic_file_read_iter,
+       .write_iter     = ext4_file_write_iter,
        .unlocked_ioctl = ext4_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
        .release        = ext4_release_file,
        .fsync          = ext4_sync_file,
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .fallocate      = ext4_fallocate,
  };
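
[Editor's note] The rewritten ext4_unaligned_aio() above folds the old per-iovec length loop into a single test, (pos | iov_iter_alignment(from)) & blockmask: iov_iter_alignment() ORs together every segment's base address and length, so one mask test catches any unaligned component. A rough userspace model of the idea, with a plain iovec array standing in for the iov_iter (names illustrative):

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/uio.h>

	/* Rough model of iov_iter_alignment(): OR together every segment's
	 * base address and length so that one mask test later catches any
	 * unaligned component.  (The real helper also handles partially
	 * consumed segments.) */
	static unsigned long iovec_alignment(const struct iovec *iov, int nr)
	{
		unsigned long res = 0;
		int i;

		for (i = 0; i < nr; i++)
			res |= (uintptr_t)iov[i].iov_base | iov[i].iov_len;
		return res;
	}

	int main(void)
	{
		static char buf[8192] __attribute__((aligned(4096)));
		struct iovec iov[2] = {
			{ .iov_base = buf,        .iov_len = 4096 },
			{ .iov_base = buf + 4096, .iov_len = 512  }, /* short tail */
		};
		unsigned long blockmask = 4096 - 1;
		long long pos = 0;

		/* mirrors: (pos | iov_iter_alignment(from)) & blockmask */
		if ((pos | iovec_alignment(iov, 2)) & blockmask)
			printf("unaligned AIO: must be serialized\n");
		else
			printf("fully block-aligned\n");
		return 0;
	}
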
  
diff --combined fs/ext4/inode.c
index 7fcd68ee915500cd53ef79cbea1c187e71024004,b2cee73c14375fec3a01fe4f434886867d16a2e1..8a064734e6eb3ed06461e9954d036da6ff1e8147
@@@ -148,9 -148,6 +148,9 @@@ static int ext4_inode_is_fast_symlink(s
          int ea_blocks = EXT4_I(inode)->i_file_acl ?
                EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
  
 +      if (ext4_has_inline_data(inode))
 +              return 0;
 +
        return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
  }
  
@@@ -446,7 -443,7 +446,7 @@@ static void ext4_map_blocks_es_recheck(
         * could be converted.
         */
        if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
 -              down_read((&EXT4_I(inode)->i_data_sem));
 +              down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
@@@ -558,7 -555,7 +558,7 @@@ int ext4_map_blocks(handle_t *handle, s
         * file system block.
         */
        if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
 -              down_read((&EXT4_I(inode)->i_data_sem));
 +              down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
@@@ -630,7 -627,7 +630,7 @@@ found
         * the write lock of i_data_sem, and call get_blocks()
         * with create == 1 flag.
         */
 -      down_write((&EXT4_I(inode)->i_data_sem));
 +      down_write(&EXT4_I(inode)->i_data_sem);
  
        /*
         * if the caller is from delayed allocation writeout path
@@@ -925,7 -922,6 +925,7 @@@ int do_journal_get_write_access(handle_
         */
        if (dirty)
                clear_buffer_dirty(bh);
 +      BUFFER_TRACE(bh, "get write access");
        ret = ext4_journal_get_write_access(handle, bh);
        if (!ret && dirty)
                ret = ext4_handle_dirty_metadata(handle, NULL, bh);
@@@ -1544,7 -1540,7 +1544,7 @@@ static int ext4_da_map_blocks(struct in
                ext4_es_lru_add(inode);
                if (ext4_es_is_hole(&es)) {
                        retval = 0;
 -                      down_read((&EXT4_I(inode)->i_data_sem));
 +                      down_read(&EXT4_I(inode)->i_data_sem);
                        goto add_delayed;
                }
  
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
 -      down_read((&EXT4_I(inode)->i_data_sem));
 +      down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_has_inline_data(inode)) {
                /*
                 * We will soon create blocks for this page, and let
@@@ -1773,7 -1769,6 +1773,7 @@@ static int __ext4_journalled_writepage(
        BUG_ON(!ext4_handle_valid(handle));
  
        if (inline_data) {
 +              BUFFER_TRACE(inode_bh, "get write access");
                ret = ext4_journal_get_write_access(handle, inode_bh);
  
                err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
@@@ -1851,7 -1846,6 +1851,7 @@@ static int ext4_writepage(struct page *
        struct buffer_head *page_bufs = NULL;
        struct inode *inode = page->mapping->host;
        struct ext4_io_submit io_submit;
 +      bool keep_towrite = false;
  
        trace_ext4_writepage(page);
        size = i_size_read(inode);
                        unlock_page(page);
                        return 0;
                }
 +              keep_towrite = true;
        }
  
        if (PageChecked(page) && ext4_should_journal_data(inode))
                unlock_page(page);
                return -ENOMEM;
        }
 -      ret = ext4_bio_write_page(&io_submit, page, len, wbc);
 +      ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
        ext4_io_submit(&io_submit);
        /* Drop io_end reference we got from init */
        ext4_put_io_end_defer(io_submit.io_end);
@@@ -1918,7 -1911,7 +1918,7 @@@ static int mpage_submit_page(struct mpa
        else
                len = PAGE_CACHE_SIZE;
        clear_page_dirty_for_io(page);
 -      err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
 +      err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
        if (!err)
                mpd->wbc->nr_to_write--;
        mpd->first_page++;
@@@ -3093,13 -3086,12 +3093,12 @@@ static void ext4_end_io_dio(struct kioc
   *
   */
  static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
-                             const struct iovec *iov, loff_t offset,
-                             unsigned long nr_segs)
+                             struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;
-       size_t count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        int overwrite = 0;
        get_block_t *get_block_func = NULL;
        int dio_flags = 0;
  
        /* Use the old path for reads and writes beyond i_size. */
        if (rw != WRITE || final_size > inode->i_size)
-               return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+               return ext4_ind_direct_IO(rw, iocb, iter, offset);
  
        BUG_ON(iocb->private == NULL);
  
                dio_flags = DIO_LOCKING;
        }
        ret = __blockdev_direct_IO(rw, iocb, inode,
-                                  inode->i_sb->s_bdev, iov,
-                                  offset, nr_segs,
+                                  inode->i_sb->s_bdev, iter,
+                                  offset,
                                   get_block_func,
                                   ext4_end_io_dio,
                                   NULL,
@@@ -3230,11 -3222,11 +3229,11 @@@ retake_lock
  }
  
  static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
-                             const struct iovec *iov, loff_t offset,
-                             unsigned long nr_segs)
+                             struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
+       size_t count = iov_iter_count(iter);
        ssize_t ret;
  
        /*
        if (ext4_has_inline_data(inode))
                return 0;
  
-       trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+       trace_ext4_direct_IO_enter(inode, offset, count, rw);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-               ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+               ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
        else
-               ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
-       trace_ext4_direct_IO_exit(inode, offset,
-                               iov_length(iov, nr_segs), rw, ret);
+               ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
+       trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
        return ret;
  }
  
@@@ -3448,7 -3439,7 +3446,7 @@@ unlock
   * This is required during truncate. We need to physically zero the tail end
   * of that block so it doesn't yield old data if the file is later grown.
   */
 -int ext4_block_truncate_page(handle_t *handle,
 +static int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from)
  {
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
@@@ -4312,15 -4303,12 +4310,15 @@@ static int ext4_do_update_inode(handle_
        struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct buffer_head *bh = iloc->bh;
 +      struct super_block *sb = inode->i_sb;
        int err = 0, rc, block;
 -      int need_datasync = 0;
 +      int need_datasync = 0, set_large_file = 0;
        uid_t i_uid;
        gid_t i_gid;
  
 -      /* For fields not not tracking in the in-memory inode,
 +      spin_lock(&ei->i_raw_lock);
 +
 +      /* For fields not tracked in the in-memory inode,
         * initialise them to zero for new inodes. */
        if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
        EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
        EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
  
 -      if (ext4_inode_blocks_set(handle, raw_inode, ei))
 +      if (ext4_inode_blocks_set(handle, raw_inode, ei)) {
 +              spin_unlock(&ei->i_raw_lock);
                goto out_brelse;
 +      }
        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
                need_datasync = 1;
        }
        if (ei->i_disksize > 0x7fffffffULL) {
 -              struct super_block *sb = inode->i_sb;
                if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
                                EXT4_SB(sb)->s_es->s_rev_level ==
 -                              cpu_to_le32(EXT4_GOOD_OLD_REV)) {
 -                      /* If this is the first large file
 -                       * created, add a flag to the superblock.
 -                       */
 -                      err = ext4_journal_get_write_access(handle,
 -                                      EXT4_SB(sb)->s_sbh);
 -                      if (err)
 -                              goto out_brelse;
 -                      ext4_update_dynamic_rev(sb);
 -                      EXT4_SET_RO_COMPAT_FEATURE(sb,
 -                                      EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 -                      ext4_handle_sync(handle);
 -                      err = ext4_handle_dirty_super(handle, sb);
 -              }
 +                  cpu_to_le32(EXT4_GOOD_OLD_REV))
 +                      set_large_file = 1;
        }
        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
  
        ext4_inode_csum_set(inode, raw_inode, ei);
  
 +      spin_unlock(&ei->i_raw_lock);
 +
        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        rc = ext4_handle_dirty_metadata(handle, NULL, bh);
        if (!err)
                err = rc;
        ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 -
 +      if (set_large_file) {
 +              BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
 +              err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
 +              if (err)
 +                      goto out_brelse;
 +              ext4_update_dynamic_rev(sb);
 +              EXT4_SET_RO_COMPAT_FEATURE(sb,
 +                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 +              ext4_handle_sync(handle);
 +              err = ext4_handle_dirty_super(handle, sb);
 +      }
        ext4_update_inode_fsync_trans(handle, inode, need_datasync);
  out_brelse:
        brelse(bh);
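
[Editor's note] ext4_do_update_inode() above now fills the raw inode under the new i_raw_lock spinlock and only records set_large_file there, deferring the LARGE_FILE superblock update until after the unlock, because ext4_journal_get_write_access() can sleep. A userspace sketch of that record-under-lock, act-after-unlock pattern (all names hypothetical):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <unistd.h>

	static pthread_spinlock_t raw_lock;	/* stands in for ei->i_raw_lock */

	/* stand-in for ext4_journal_get_write_access(): may sleep, so it
	 * must never be called with raw_lock held */
	static int journal_get_write_access(void)
	{
		usleep(1000);
		return 0;
	}

	static void update_inode(long long disksize)
	{
		bool set_large_file = false;

		pthread_spin_lock(&raw_lock);
		/* ... copy in-memory fields into the raw on-disk inode ... */
		if (disksize > 0x7fffffffLL)
			set_large_file = true;	/* only record the fact here */
		pthread_spin_unlock(&raw_lock);

		/* the sleeping superblock update happens after the unlock */
		if (set_large_file && journal_get_write_access() == 0)
			printf("LARGE_FILE feature set in the superblock\n");
	}

	int main(void)
	{
		pthread_spin_init(&raw_lock, PTHREAD_PROCESS_PRIVATE);
		update_inode(1LL << 33);
		pthread_spin_destroy(&raw_lock);
		return 0;
	}
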
diff --combined fs/f2fs/data.c
index c1fb6dd10911c01e9b37d533a7588ee6bf934ecb,1d2e7e9624d2c4f5fc372a0f86be916169deebe4..0924521306b40c5087f2c2170c92fe7b03452862
@@@ -417,7 -417,7 +417,7 @@@ struct page *find_data_page(struct inod
        if (unlikely(dn.data_blkaddr == NEW_ADDR))
                return ERR_PTR(-EINVAL);
  
 -      page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
 +      page = grab_cache_page(mapping, index);
        if (!page)
                return ERR_PTR(-ENOMEM);
  
@@@ -455,7 -455,7 +455,7 @@@ struct page *get_lock_data_page(struct 
        int err;
  
  repeat:
 -      page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
 +      page = grab_cache_page(mapping, index);
        if (!page)
                return ERR_PTR(-ENOMEM);
  
@@@ -652,7 -652,8 +652,7 @@@ static int get_data_block(struct inode 
                goto put_out;
        }
  
 -      end_offset = IS_INODE(dn.node_page) ?
 -                      ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
 +      end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
        bh_result->b_size = (((size_t)1) << blkbits);
        dn.ofs_in_node++;
        pgofs++;
@@@ -674,7 -675,8 +674,7 @@@ get_next
                if (dn.data_blkaddr == NEW_ADDR)
                        goto put_out;
  
 -              end_offset = IS_INODE(dn.node_page) ?
 -                      ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
 +              end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
        }
  
        if (maxblocks > (bh_result->b_size >> blkbits)) {
@@@ -708,19 -710,11 +708,19 @@@ out
        return err;
  }
  
 +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 +              u64 start, u64 len)
 +{
 +      return generic_block_fiemap(inode, fieinfo, start, len, get_data_block);
 +}
 +
  static int f2fs_read_data_page(struct file *file, struct page *page)
  {
        struct inode *inode = page->mapping->host;
        int ret;
  
 +      trace_f2fs_readpage(page, DATA);
 +
        /* If the file has inline data, try to read it directlly */
        if (f2fs_has_inline_data(inode))
                ret = f2fs_read_inline_data(inode, page);
@@@ -796,8 -790,6 +796,8 @@@ static int f2fs_write_data_page(struct 
                .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
        };
  
 +      trace_f2fs_writepage(page, DATA);
 +
        if (page->index < end_index)
                goto write;
  
         * this page does not have to be written to disk.
         */
        offset = i_size & (PAGE_CACHE_SIZE - 1);
 -      if ((page->index >= end_index + 1) || !offset) {
 -              inode_dec_dirty_dents(inode);
 +      if ((page->index >= end_index + 1) || !offset)
                goto out;
 -      }
  
        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
  write:
  
        /* Dentry blocks are controlled by checkpoint */
        if (S_ISDIR(inode->i_mode)) {
 -              inode_dec_dirty_dents(inode);
                err = do_write_data_page(page, &fio);
                goto done;
        }
@@@ -837,16 -832,15 +837,16 @@@ done
  
        clear_cold_data(page);
  out:
 +      inode_dec_dirty_dents(inode);
        unlock_page(page);
        if (need_balance_fs)
                f2fs_balance_fs(sbi);
 +      if (wbc->for_reclaim)
 +              f2fs_submit_merged_bio(sbi, DATA, WRITE);
        return 0;
  
  redirty_out:
 -      wbc->pages_skipped++;
 -      account_page_redirty(page);
 -      set_page_dirty(page);
 +      redirty_page_for_writepage(wbc, page);
        return AOP_WRITEPAGE_ACTIVATE;
  }
  
@@@ -868,15 -862,12 +868,15 @@@ static int f2fs_write_data_pages(struc
        int ret;
        long diff;
  
 +      trace_f2fs_writepages(mapping->host, wbc, DATA);
 +
        /* deal with chardevs and other special file */
        if (!mapping->a_ops->writepage)
                return 0;
  
        if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
 -                      get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA))
 +                      get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) &&
 +                      available_free_memory(sbi, DIRTY_DENTS))
                goto skip_write;
  
        diff = nr_pages_to_write(sbi, DATA, wbc);
@@@ -912,8 -903,6 +912,8 @@@ static int f2fs_write_begin(struct fil
        struct dnode_of_data dn;
        int err = 0;
  
 +      trace_f2fs_write_begin(inode, pos, len, flags);
 +
        f2fs_balance_fs(sbi);
  repeat:
        err = f2fs_convert_inline_data(inode, pos + len);
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
 +
 +      /* to avoid latency during memory pressure */
 +      unlock_page(page);
 +
        *pagep = page;
  
        if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
        f2fs_unlock_op(sbi);
  
        if (err) {
 -              f2fs_put_page(page, 1);
 +              f2fs_put_page(page, 0);
                return err;
        }
  inline_data:
 +      lock_page(page);
 +      if (unlikely(page->mapping != mapping)) {
 +              f2fs_put_page(page, 1);
 +              goto repeat;
 +      }
 +
 +      f2fs_wait_on_page_writeback(page, DATA);
 +
        if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
                return 0;
  
@@@ -1001,8 -978,6 +1001,8 @@@ static int f2fs_write_end(struct file *
  {
        struct inode *inode = page->mapping->host;
  
 +      trace_f2fs_write_end(inode, pos, len, copied);
 +
        SetPageUptodate(page);
        set_page_dirty(page);
  
  }
  
  static int check_direct_IO(struct inode *inode, int rw,
-               const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+               struct iov_iter *iter, loff_t offset)
  {
        unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
-       int i;
  
        if (rw == READ)
                return 0;
        if (offset & blocksize_mask)
                return -EINVAL;
  
-       for (i = 0; i < nr_segs; i++)
-               if (iov[i].iov_len & blocksize_mask)
-                       return -EINVAL;
+       if (iov_iter_alignment(iter) & blocksize_mask)
+               return -EINVAL;
        return 0;
  }
  
  static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
-               const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+               struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        if (f2fs_has_inline_data(inode))
                return 0;
  
-       if (check_direct_IO(inode, rw, iov, offset, nr_segs))
+       if (check_direct_IO(inode, rw, iter, offset))
                return 0;
  
-       return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
-                                                       get_data_block);
 +      /* clear fsync mark to recover these blocks */
 +      fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
 +
+       return blockdev_direct_IO(rw, iocb, inode, iter, offset,
+                                 get_data_block);
  }
  
  static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
@@@ -1089,11 -1060,6 +1088,11 @@@ static int f2fs_set_data_page_dirty(str
  
  static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
  {
 +      struct inode *inode = mapping->host;
 +
 +      if (f2fs_has_inline_data(inode))
 +              return 0;
 +
        return generic_block_bmap(mapping, block, get_data_block);
  }
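
[Editor's note] f2fs_write_begin() above unlocks the freshly grabbed page across the block reservation and relocks it at inline_data:, so it must recheck page->mapping and retry the grab if truncation raced in between. A toy userspace model of that relock-and-recheck loop (struct page and the race are simulated):

	#include <stdio.h>
	#include <stdlib.h>

	struct page { void *mapping; };	/* toy stand-in */

	static struct page *grab_locked_page(void *mapping)
	{
		struct page *page = malloc(sizeof(*page));

		page->mapping = mapping;
		return page;
	}

	/* while the page was unlocked, truncation may detach it */
	static void maybe_truncate(struct page *page, int race)
	{
		if (race)
			page->mapping = NULL;
	}

	int main(void)
	{
		void *mapping = (void *)0x1000;	/* toy address-space cookie */
		struct page *page;
		int race = 1;

	repeat:
		page = grab_locked_page(mapping);
		/* unlock here, do the slow block reservation ... */
		maybe_truncate(page, race);
		race = 0;	/* race fires only once in this demo */
		/* relock, then recheck ownership as f2fs_write_begin does */
		if (page->mapping != mapping) {
			free(page);
			goto repeat;
		}
		printf("page still ours, proceed with the write\n");
		free(page);
		return 0;
	}
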
  
diff --combined fs/f2fs/file.c
index 9c49c593d8eb4ab39a1aa28c1b841f949d02c050,e4ba4b93f96a90e70c173fa309b242c0390ef8cf..c58e330757191392656d2819fd937a1cc564cb37
@@@ -19,7 -19,6 +19,7 @@@
  #include <linux/compat.h>
  #include <linux/uaccess.h>
  #include <linux/mount.h>
 +#include <linux/pagevec.h>
  
  #include "f2fs.h"
  #include "node.h"
@@@ -195,132 -194,6 +195,132 @@@ out
        return ret;
  }
  
 +static pgoff_t __get_first_dirty_index(struct address_space *mapping,
 +                                              pgoff_t pgofs, int whence)
 +{
 +      struct pagevec pvec;
 +      int nr_pages;
 +
 +      if (whence != SEEK_DATA)
 +              return 0;
 +
 +      /* find first dirty page index */
 +      pagevec_init(&pvec, 0);
 +      nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1);
 +      pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX;
 +      pagevec_release(&pvec);
 +      return pgofs;
 +}
 +
 +static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs,
 +                                                      int whence)
 +{
 +      switch (whence) {
 +      case SEEK_DATA:
 +              if ((blkaddr == NEW_ADDR && dirty == pgofs) ||
 +                      (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR))
 +                      return true;
 +              break;
 +      case SEEK_HOLE:
 +              if (blkaddr == NULL_ADDR)
 +                      return true;
 +              break;
 +      }
 +      return false;
 +}
 +
 +static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 +{
 +      struct inode *inode = file->f_mapping->host;
 +      loff_t maxbytes = inode->i_sb->s_maxbytes;
 +      struct dnode_of_data dn;
 +      pgoff_t pgofs, end_offset, dirty;
 +      loff_t data_ofs = offset;
 +      loff_t isize;
 +      int err = 0;
 +
 +      mutex_lock(&inode->i_mutex);
 +
 +      isize = i_size_read(inode);
 +      if (offset >= isize)
 +              goto fail;
 +
 +      /* handle inline data case */
 +      if (f2fs_has_inline_data(inode)) {
 +              if (whence == SEEK_HOLE)
 +                      data_ofs = isize;
 +              goto found;
 +      }
 +
 +      pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT);
 +
 +      dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
 +
 +      for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) {
 +              set_new_dnode(&dn, inode, NULL, NULL, 0);
 +              err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
 +              if (err && err != -ENOENT) {
 +                      goto fail;
 +              } else if (err == -ENOENT) {
 +                      /* direct node does not exist */
 +                      if (whence == SEEK_DATA) {
 +                              pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
 +                                                      F2FS_I(inode));
 +                              continue;
 +                      } else {
 +                              goto found;
 +                      }
 +              }
 +
 +              end_offset = IS_INODE(dn.node_page) ?
 +                      ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
 +
 +              /* find data/hole in dnode block */
 +              for (; dn.ofs_in_node < end_offset;
 +                              dn.ofs_in_node++, pgofs++,
 +                              data_ofs = pgofs << PAGE_CACHE_SHIFT) {
 +                      block_t blkaddr;
 +                      blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
 +
 +                      if (__found_offset(blkaddr, dirty, pgofs, whence)) {
 +                              f2fs_put_dnode(&dn);
 +                              goto found;
 +                      }
 +              }
 +              f2fs_put_dnode(&dn);
 +      }
 +
 +      if (whence == SEEK_DATA)
 +              goto fail;
 +found:
 +      if (whence == SEEK_HOLE && data_ofs > isize)
 +              data_ofs = isize;
 +      mutex_unlock(&inode->i_mutex);
 +      return vfs_setpos(file, data_ofs, maxbytes);
 +fail:
 +      mutex_unlock(&inode->i_mutex);
 +      return -ENXIO;
 +}
 +
 +static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
 +{
 +      struct inode *inode = file->f_mapping->host;
 +      loff_t maxbytes = inode->i_sb->s_maxbytes;
 +
 +      switch (whence) {
 +      case SEEK_SET:
 +      case SEEK_CUR:
 +      case SEEK_END:
 +              return generic_file_llseek_size(file, offset, whence,
 +                                              maxbytes, i_size_read(inode));
 +      case SEEK_DATA:
 +      case SEEK_HOLE:
 +              return f2fs_seek_block(file, offset, whence);
 +      }
 +
 +      return -EINVAL;
 +}
 +
  static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
  {
        file_accessed(file);
@@@ -369,9 -242,6 +369,9 @@@ static void truncate_partial_data_page(
        unsigned offset = from & (PAGE_CACHE_SIZE - 1);
        struct page *page;
  
 +      if (f2fs_has_inline_data(inode))
 +              return truncate_inline_data(inode, from);
 +
        if (!offset)
                return;
  
@@@ -418,7 -288,10 +418,7 @@@ int truncate_blocks(struct inode *inode
                return err;
        }
  
 -      if (IS_INODE(dn.node_page))
 -              count = ADDRS_PER_INODE(F2FS_I(inode));
 -      else
 -              count = ADDRS_PER_BLOCK;
 +      count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
  
        count -= dn.ofs_in_node;
        f2fs_bug_on(count < 0);
@@@ -540,7 -413,6 +540,7 @@@ const struct inode_operations f2fs_file
        .listxattr      = f2fs_listxattr,
        .removexattr    = generic_removexattr,
  #endif
 +      .fiemap         = f2fs_fiemap,
  };
  
  static void fill_zero(struct inode *inode, pgoff_t index,
@@@ -683,7 -555,6 +683,7 @@@ static int expand_inode_data(struct ino
                i_size_read(inode) < new_size) {
                i_size_write(inode, new_size);
                mark_inode_dirty(inode);
 +              f2fs_write_inode(inode, NULL);
        }
  
        return ret;
@@@ -807,11 -678,11 +807,11 @@@ long f2fs_compat_ioctl(struct file *fil
  #endif
  
  const struct file_operations f2fs_file_operations = {
 -      .llseek         = generic_file_llseek,
 +      .llseek         = f2fs_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = generic_file_aio_read,
-       .aio_write      = generic_file_aio_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = generic_file_read_iter,
+       .write_iter     = generic_file_write_iter,
        .open           = generic_file_open,
        .mmap           = f2fs_file_mmap,
        .fsync          = f2fs_sync_file,
        .compat_ioctl   = f2fs_compat_ioctl,
  #endif
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
  };
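
[Editor's note] With f2fs_llseek() wired up above, SEEK_DATA and SEEK_HOLE walk the dnode blocks (consulting the dirty-page index for data not yet on disk) instead of falling back to plain byte-offset seeks. The new behaviour is exercised from userspace through lseek(2); a minimal probe (file name illustrative):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	#ifndef SEEK_DATA
	#define SEEK_DATA 3
	#define SEEK_HOLE 4
	#endif

	int main(int argc, char **argv)
	{
		const char *path = argc > 1 ? argv[1] : "sparse.img";
		int fd = open(path, O_RDONLY);
		off_t data, hole;

		if (fd < 0) {
			perror(path);
			return 1;
		}
		data = lseek(fd, 0, SEEK_DATA);	/* first non-hole byte */
		hole = lseek(fd, data < 0 ? 0 : data, SEEK_HOLE);
		printf("first data at %lld, next hole at %lld\n",
		       (long long)data, (long long)hole);
		close(fd);
		return 0;
	}
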
diff --combined fs/fat/inode.c
index 9c83594d7fb5dbb03e5eb7ca378c4b35ffd3d85f,385cce464e822a3a9de91113d1b314482670965f..756aead10d9618593e3267e697df4915d528fbc9
  #define CONFIG_FAT_DEFAULT_IOCHARSET  ""
  #endif
  
 +#define KB_IN_SECTORS 2
 +
 +/*
 + * A deserialized copy of the on-disk structure laid out in struct
 + * fat_boot_sector.
 + */
 +struct fat_bios_param_block {
 +      u16     fat_sector_size;
 +      u8      fat_sec_per_clus;
 +      u16     fat_reserved;
 +      u8      fat_fats;
 +      u16     fat_dir_entries;
 +      u16     fat_sectors;
 +      u16     fat_fat_length;
 +      u32     fat_total_sect;
 +
 +      u8      fat16_state;
 +      u32     fat16_vol_id;
 +
 +      u32     fat32_length;
 +      u32     fat32_root_cluster;
 +      u16     fat32_info_sector;
 +      u8      fat32_state;
 +      u32     fat32_vol_id;
 +};
 +
  static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE;
  static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET;
  
 +static struct fat_floppy_defaults {
 +      unsigned nr_sectors;
 +      unsigned sec_per_clus;
 +      unsigned dir_entries;
 +      unsigned media;
 +      unsigned fat_length;
 +} floppy_defaults[] = {
 +{
 +      .nr_sectors = 160 * KB_IN_SECTORS,
 +      .sec_per_clus = 1,
 +      .dir_entries = 64,
 +      .media = 0xFE,
 +      .fat_length = 1,
 +},
 +{
 +      .nr_sectors = 180 * KB_IN_SECTORS,
 +      .sec_per_clus = 1,
 +      .dir_entries = 64,
 +      .media = 0xFC,
 +      .fat_length = 2,
 +},
 +{
 +      .nr_sectors = 320 * KB_IN_SECTORS,
 +      .sec_per_clus = 2,
 +      .dir_entries = 112,
 +      .media = 0xFF,
 +      .fat_length = 1,
 +},
 +{
 +      .nr_sectors = 360 * KB_IN_SECTORS,
 +      .sec_per_clus = 2,
 +      .dir_entries = 112,
 +      .media = 0xFD,
 +      .fat_length = 2,
 +},
 +};
  
  static int fat_add_cluster(struct inode *inode)
  {
@@@ -247,12 -185,13 +247,13 @@@ static int fat_write_end(struct file *f
  }
  
  static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
-                            const struct iovec *iov,
-                            loff_t offset, unsigned long nr_segs)
+                            struct iov_iter *iter,
+                            loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
+       size_t count = iov_iter_count(iter);
        ssize_t ret;
  
        if (rw == WRITE) {
                 *
                 * Return 0, and fall back to normal buffered write.
                 */
-               loff_t size = offset + iov_length(iov, nr_segs);
+               loff_t size = offset + count;
                if (MSDOS_I(inode)->mmu_private < size)
                        return 0;
        }
         * FAT needs to use DIO_LOCKING to avoid the race
         * condition between fat_get_block() and ->truncate().
         */
-       ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
-                                fat_get_block);
+       ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, fat_get_block);
        if (ret < 0 && (rw & WRITE))
-               fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
+               fat_write_failed(mapping, offset + count);
  
        return ret;
  }
@@@ -421,7 -359,7 +421,7 @@@ struct inode *fat_iget(struct super_blo
  
  static int is_exec(unsigned char *extension)
  {
 -      unsigned char *exe_extensions = "EXECOMBAT", *walk;
 +      unsigned char exe_extensions[] = "EXECOMBAT", *walk;
  
        for (walk = exe_extensions; *walk; walk += 3)
                if (!strncmp(extension, walk, 3))
@@@ -915,8 -853,6 +915,8 @@@ static int fat_show_options(struct seq_
                seq_puts(m, ",nfs=stale_rw");
        if (opts->discard)
                seq_puts(m, ",discard");
 +      if (opts->dos1xfloppy)
 +              seq_puts(m, ",dos1xfloppy");
  
        return 0;
  }
@@@ -931,7 -867,7 +931,7 @@@ enum 
        Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
        Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
        Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
 -      Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err,
 +      Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy,
  };
  
  static const match_table_t fat_tokens = {
        {Opt_nfs_stale_rw, "nfs"},
        {Opt_nfs_stale_rw, "nfs=stale_rw"},
        {Opt_nfs_nostale_ro, "nfs=nostale_ro"},
 +      {Opt_dos1xfloppy, "dos1xfloppy"},
        {Opt_obsolete, "conv=binary"},
        {Opt_obsolete, "conv=text"},
        {Opt_obsolete, "conv=auto"},
@@@ -1167,9 -1102,6 +1167,9 @@@ static int parse_options(struct super_b
                case Opt_nfs_nostale_ro:
                        opts->nfs = FAT_NFS_NOSTALE_RO;
                        break;
 +              case Opt_dos1xfloppy:
 +                      opts->dos1xfloppy = 1;
 +                      break;
  
                /* msdos specific */
                case Opt_dots:
@@@ -1315,169 -1247,6 +1315,169 @@@ static unsigned long calc_fat_clusters(
        return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
  }
  
 +static bool fat_bpb_is_zero(struct fat_boot_sector *b)
 +{
 +      if (get_unaligned_le16(&b->sector_size))
 +              return false;
 +      if (b->sec_per_clus)
 +              return false;
 +      if (b->reserved)
 +              return false;
 +      if (b->fats)
 +              return false;
 +      if (get_unaligned_le16(&b->dir_entries))
 +              return false;
 +      if (get_unaligned_le16(&b->sectors))
 +              return false;
 +      if (b->media)
 +              return false;
 +      if (b->fat_length)
 +              return false;
 +      if (b->secs_track)
 +              return false;
 +      if (b->heads)
 +              return false;
 +      return true;
 +}
 +
 +static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b,
 +      int silent, struct fat_bios_param_block *bpb)
 +{
 +      int error = -EINVAL;
 +
 +      /* Read in BPB ... */
 +      memset(bpb, 0, sizeof(*bpb));
 +      bpb->fat_sector_size = get_unaligned_le16(&b->sector_size);
 +      bpb->fat_sec_per_clus = b->sec_per_clus;
 +      bpb->fat_reserved = le16_to_cpu(b->reserved);
 +      bpb->fat_fats = b->fats;
 +      bpb->fat_dir_entries = get_unaligned_le16(&b->dir_entries);
 +      bpb->fat_sectors = get_unaligned_le16(&b->sectors);
 +      bpb->fat_fat_length = le16_to_cpu(b->fat_length);
 +      bpb->fat_total_sect = le32_to_cpu(b->total_sect);
 +
 +      bpb->fat16_state = b->fat16.state;
 +      bpb->fat16_vol_id = get_unaligned_le32(b->fat16.vol_id);
 +
 +      bpb->fat32_length = le32_to_cpu(b->fat32.length);
 +      bpb->fat32_root_cluster = le32_to_cpu(b->fat32.root_cluster);
 +      bpb->fat32_info_sector = le16_to_cpu(b->fat32.info_sector);
 +      bpb->fat32_state = b->fat32.state;
 +      bpb->fat32_vol_id = get_unaligned_le32(b->fat32.vol_id);
 +
 +      /* Validate this looks like a FAT filesystem BPB */
 +      if (!bpb->fat_reserved) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR,
 +                              "bogus number of reserved sectors");
 +              goto out;
 +      }
 +      if (!bpb->fat_fats) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
 +              goto out;
 +      }
 +
 +      /*
 +       * Earlier we checked here that b->secs_track and b->heads are nonzero,
 +       * but it turns out valid FAT filesystems can have zero there.
 +       */
 +
 +      if (!fat_valid_media(b->media)) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
 +                              (unsigned)b->media);
 +              goto out;
 +      }
 +
 +      if (!is_power_of_2(bpb->fat_sector_size)
 +          || (bpb->fat_sector_size < 512)
 +          || (bpb->fat_sector_size > 4096)) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
 +                             (unsigned)bpb->fat_sector_size);
 +              goto out;
 +      }
 +
 +      if (!is_power_of_2(bpb->fat_sec_per_clus)) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
 +                              (unsigned)bpb->fat_sec_per_clus);
 +              goto out;
 +      }
 +
 +      error = 0;
 +
 +out:
 +      return error;
 +}
 +
 +static int fat_read_static_bpb(struct super_block *sb,
 +      struct fat_boot_sector *b, int silent,
 +      struct fat_bios_param_block *bpb)
 +{
 +      static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
 +
 +      struct fat_floppy_defaults *fdefaults = NULL;
 +      int error = -EINVAL;
 +      sector_t bd_sects;
 +      unsigned i;
 +
 +      bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE;
 +
 +      /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
 +      if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR,
 +                              "%s; no bootstrapping code", notdos1x);
 +              goto out;
 +      }
 +
 +      /*
 +       * If any value in this region is non-zero, it isn't archaic
 +       * DOS.
 +       */
 +      if (!fat_bpb_is_zero(b)) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR,
 +                              "%s; DOS 2.x BPB is non-zero", notdos1x);
 +              goto out;
 +      }
 +
 +      for (i = 0; i < ARRAY_SIZE(floppy_defaults); i++) {
 +              if (floppy_defaults[i].nr_sectors == bd_sects) {
 +                      fdefaults = &floppy_defaults[i];
 +                      break;
 +              }
 +      }
 +
 +      if (fdefaults == NULL) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_WARNING,
 +                              "This looks like a DOS 1.x volume, but isn't a recognized floppy size (%llu sectors)",
 +                              (u64)bd_sects);
 +              goto out;
 +      }
 +
 +      if (!silent)
 +              fat_msg(sb, KERN_INFO,
 +                      "This looks like a DOS 1.x volume; assuming default BPB values");
 +
 +      memset(bpb, 0, sizeof(*bpb));
 +      bpb->fat_sector_size = SECTOR_SIZE;
 +      bpb->fat_sec_per_clus = fdefaults->sec_per_clus;
 +      bpb->fat_reserved = 1;
 +      bpb->fat_fats = 2;
 +      bpb->fat_dir_entries = fdefaults->dir_entries;
 +      bpb->fat_sectors = fdefaults->nr_sectors;
 +      bpb->fat_fat_length = fdefaults->fat_length;
 +
 +      error = 0;
 +
 +out:
 +      return error;
 +}
 +
  /*
   * Read the super block of an MS-DOS FS.
   */
@@@ -1487,11 -1256,12 +1487,11 @@@ int fat_fill_super(struct super_block *
        struct inode *root_inode = NULL, *fat_inode = NULL;
        struct inode *fsinfo_inode = NULL;
        struct buffer_head *bh;
 -      struct fat_boot_sector *b;
 +      struct fat_bios_param_block bpb;
        struct msdos_sb_info *sbi;
        u16 logical_sector_size;
        u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors;
        int debug;
 -      unsigned int media;
        long error;
        char buf[50];
  
                goto out_fail;
        }
  
 -      b = (struct fat_boot_sector *) bh->b_data;
 -      if (!b->reserved) {
 -              if (!silent)
 -                      fat_msg(sb, KERN_ERR, "bogus number of reserved sectors");
 -              brelse(bh);
 -              goto out_invalid;
 -      }
 -      if (!b->fats) {
 -              if (!silent)
 -                      fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
 -              brelse(bh);
 -              goto out_invalid;
 -      }
 -
 -      /*
 -       * Earlier we checked here that b->secs_track and b->head are nonzero,
 -       * but it turns out valid FAT filesystems can have zero there.
 -       */
 +      error = fat_read_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent,
 +              &bpb);
 +      if (error == -EINVAL && sbi->options.dos1xfloppy)
 +              error = fat_read_static_bpb(sb,
 +                      (struct fat_boot_sector *)bh->b_data, silent, &bpb);
 +      brelse(bh);
  
 -      media = b->media;
 -      if (!fat_valid_media(media)) {
 -              if (!silent)
 -                      fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
 -                             media);
 -              brelse(bh);
 +      if (error == -EINVAL)
                goto out_invalid;
 -      }
 -      logical_sector_size = get_unaligned_le16(&b->sector_size);
 -      if (!is_power_of_2(logical_sector_size)
 -          || (logical_sector_size < 512)
 -          || (logical_sector_size > 4096)) {
 -              if (!silent)
 -                      fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
 -                             logical_sector_size);
 -              brelse(bh);
 -              goto out_invalid;
 -      }
 -      sbi->sec_per_clus = b->sec_per_clus;
 -      if (!is_power_of_2(sbi->sec_per_clus)) {
 -              if (!silent)
 -                      fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
 -                             sbi->sec_per_clus);
 -              brelse(bh);
 -              goto out_invalid;
 -      }
 +      else if (error)
 +              goto out_fail;
  
 +      logical_sector_size = bpb.fat_sector_size;
 +      sbi->sec_per_clus = bpb.fat_sec_per_clus;
 +
 +      error = -EIO;
        if (logical_sector_size < sb->s_blocksize) {
                fat_msg(sb, KERN_ERR, "logical sector size too small for device"
                       " (logical sector size = %u)", logical_sector_size);
 -              brelse(bh);
                goto out_fail;
        }
 +
        if (logical_sector_size > sb->s_blocksize) {
 -              brelse(bh);
 +              struct buffer_head *bh_resize;
  
                if (!sb_set_blocksize(sb, logical_sector_size)) {
                        fat_msg(sb, KERN_ERR, "unable to set blocksize %u",
                               logical_sector_size);
                        goto out_fail;
                }
 -              bh = sb_bread(sb, 0);
 -              if (bh == NULL) {
 +
 +              /* Verify that the larger boot sector is fully readable */
 +              bh_resize = sb_bread(sb, 0);
 +              if (bh_resize == NULL) {
                        fat_msg(sb, KERN_ERR, "unable to read boot sector"
                               " (logical sector size = %lu)",
                               sb->s_blocksize);
                        goto out_fail;
                }
 -              b = (struct fat_boot_sector *) bh->b_data;
 +              brelse(bh_resize);
        }
  
        mutex_init(&sbi->s_lock);
        sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus;
        sbi->cluster_bits = ffs(sbi->cluster_size) - 1;
 -      sbi->fats = b->fats;
 +      sbi->fats = bpb.fat_fats;
        sbi->fat_bits = 0;              /* Don't know yet */
 -      sbi->fat_start = le16_to_cpu(b->reserved);
 -      sbi->fat_length = le16_to_cpu(b->fat_length);
 +      sbi->fat_start = bpb.fat_reserved;
 +      sbi->fat_length = bpb.fat_fat_length;
        sbi->root_cluster = 0;
        sbi->free_clusters = -1;        /* Don't know yet */
        sbi->free_clus_valid = 0;
        sbi->prev_free = FAT_START_ENT;
        sb->s_maxbytes = 0xffffffff;
  
 -      if (!sbi->fat_length && b->fat32.length) {
 +      if (!sbi->fat_length && bpb.fat32_length) {
                struct fat_boot_fsinfo *fsinfo;
                struct buffer_head *fsinfo_bh;
  
                /* Must be FAT32 */
                sbi->fat_bits = 32;
 -              sbi->fat_length = le32_to_cpu(b->fat32.length);
 -              sbi->root_cluster = le32_to_cpu(b->fat32.root_cluster);
 +              sbi->fat_length = bpb.fat32_length;
 +              sbi->root_cluster = bpb.fat32_root_cluster;
  
                /* MC - if info_sector is 0, don't multiply by 0 */
 -              sbi->fsinfo_sector = le16_to_cpu(b->fat32.info_sector);
 +              sbi->fsinfo_sector = bpb.fat32_info_sector;
                if (sbi->fsinfo_sector == 0)
                        sbi->fsinfo_sector = 1;
  
                if (fsinfo_bh == NULL) {
                        fat_msg(sb, KERN_ERR, "bread failed, FSINFO block"
                               " (sector = %lu)", sbi->fsinfo_sector);
 -                      brelse(bh);
                        goto out_fail;
                }
  
  
        /* interpret volume ID as a little endian 32 bit integer */
        if (sbi->fat_bits == 32)
 -              sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
 -                                      ((u32)b->fat32.vol_id[1] << 8) |
 -                                      ((u32)b->fat32.vol_id[2] << 16) |
 -                                      ((u32)b->fat32.vol_id[3] << 24));
 +              sbi->vol_id = bpb.fat32_vol_id;
        else /* fat 16 or 12 */
 -              sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
 -                                      ((u32)b->fat16.vol_id[1] << 8) |
 -                                      ((u32)b->fat16.vol_id[2] << 16) |
 -                                      ((u32)b->fat16.vol_id[3] << 24));
 +              sbi->vol_id = bpb.fat16_vol_id;
  
        sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
        sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
  
        sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length;
 -      sbi->dir_entries = get_unaligned_le16(&b->dir_entries);
 +      sbi->dir_entries = bpb.fat_dir_entries;
        if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
                if (!silent)
                        fat_msg(sb, KERN_ERR, "bogus directory-entries per block"
                               " (%u)", sbi->dir_entries);
 -              brelse(bh);
                goto out_invalid;
        }
  
        rootdir_sectors = sbi->dir_entries
                * sizeof(struct msdos_dir_entry) / sb->s_blocksize;
        sbi->data_start = sbi->dir_start + rootdir_sectors;
 -      total_sectors = get_unaligned_le16(&b->sectors);
 +      total_sectors = bpb.fat_sectors;
        if (total_sectors == 0)
 -              total_sectors = le32_to_cpu(b->total_sect);
 +              total_sectors = bpb.fat_total_sect;
  
        total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus;
  
  
        /* some OSes set FAT_STATE_DIRTY and clean it on unmount. */
        if (sbi->fat_bits == 32)
 -              sbi->dirty = b->fat32.state & FAT_STATE_DIRTY;
 +              sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY;
        else /* fat 16 or 12 */
 -              sbi->dirty = b->fat16.state & FAT_STATE_DIRTY;
 +              sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY;
  
        /* check that FAT table does not overflow */
        fat_clusters = calc_fat_clusters(sb);
                if (!silent)
                        fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
                               total_clusters);
 -              brelse(bh);
                goto out_invalid;
        }
  
        if (sbi->prev_free < FAT_START_ENT)
                sbi->prev_free = FAT_START_ENT;
  
 -      brelse(bh);
 -
        /* set up enough so that it can read an inode */
        fat_hash_init(sb);
        dir_hash_init(sb);
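
[Editor's note] fat_read_static_bpb() above synthesizes a BPB for DOS 1.x media, which carry none, by matching the raw device size against known floppy geometries; it runs only under the new dos1xfloppy mount option (e.g. mount -t msdos -o dos1xfloppy /dev/fd0 /mnt). A userspace sketch of the same table lookup, media byte omitted (sizes copied from floppy_defaults above):

	#include <stdio.h>

	#define KB_IN_SECTORS 2	/* 512-byte sectors per KiB */

	static const struct {
		unsigned int nr_sectors, sec_per_clus, dir_entries, fat_length;
	} floppy_defaults[] = {
		{ 160 * KB_IN_SECTORS, 1,  64, 1 },
		{ 180 * KB_IN_SECTORS, 1,  64, 2 },
		{ 320 * KB_IN_SECTORS, 2, 112, 1 },
		{ 360 * KB_IN_SECTORS, 2, 112, 2 },
	};

	int main(void)
	{
		unsigned long long bd_sects = 360 * KB_IN_SECTORS; /* 360K floppy */
		unsigned int i;

		for (i = 0; i < sizeof(floppy_defaults) / sizeof(floppy_defaults[0]); i++) {
			if (floppy_defaults[i].nr_sectors == bd_sects) {
				printf("DOS 1.x defaults: %u sec/clus, %u root entries, FAT %u sectors\n",
				       floppy_defaults[i].sec_per_clus,
				       floppy_defaults[i].dir_entries,
				       floppy_defaults[i].fat_length);
				return 0;
			}
		}
		printf("%llu sectors is not a recognized DOS 1.x floppy size\n",
		       bd_sects);
		return 1;
	}
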
diff --combined fs/file_table.c
index 40bf4660f0a3aa18bf881c2b3c09dea95e0808ec,f8cc881fbbfb3ff7fca0ea2bc589f0ab8a9f48db..385bfd31512a17f4e4c6869a3ee8f32c456cd327
@@@ -76,14 -76,14 +76,14 @@@ EXPORT_SYMBOL_GPL(get_max_files)
   * Handle nr_files sysctl
   */
  #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
 -int proc_nr_files(ctl_table *table, int write,
 +int proc_nr_files(struct ctl_table *table, int write,
                       void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        files_stat.nr_files = get_nr_files();
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
  }
  #else
 -int proc_nr_files(ctl_table *table, int write,
 +int proc_nr_files(struct ctl_table *table, int write,
                       void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
@@@ -175,6 -175,12 +175,12 @@@ struct file *alloc_file(struct path *pa
        file->f_path = *path;
        file->f_inode = path->dentry->d_inode;
        file->f_mapping = path->dentry->d_inode->i_mapping;
+       if ((mode & FMODE_READ) &&
+            likely(fop->read || fop->aio_read || fop->read_iter))
+               mode |= FMODE_CAN_READ;
+       if ((mode & FMODE_WRITE) &&
+            likely(fop->write || fop->aio_write || fop->write_iter))
+               mode |= FMODE_CAN_WRITE;
        file->f_mode = mode;
        file->f_op = fop;
        if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
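
[Editor's note] alloc_file() above caches whether any read/write method exists as FMODE_CAN_READ/FMODE_CAN_WRITE, so read/write paths can test one mode bit instead of three method pointers. A userspace sketch of the derivation (bit values illustrative, not the kernel's):

	#include <stdio.h>

	#define FMODE_READ	0x1
	#define FMODE_WRITE	0x2
	#define FMODE_CAN_READ	0x4	/* illustrative bit values only */
	#define FMODE_CAN_WRITE	0x8

	struct file_operations {	/* just the pointers the check looks at */
		void *read, *write, *aio_read, *aio_write, *read_iter, *write_iter;
	};

	static unsigned int derive_fmode(unsigned int mode,
					 const struct file_operations *fop)
	{
		if ((mode & FMODE_READ) &&
		    (fop->read || fop->aio_read || fop->read_iter))
			mode |= FMODE_CAN_READ;
		if ((mode & FMODE_WRITE) &&
		    (fop->write || fop->aio_write || fop->write_iter))
			mode |= FMODE_CAN_WRITE;
		return mode;
	}

	int main(void)
	{
		/* iter-only fops, like the filesystems converted in this merge */
		struct file_operations fop = {
			.read_iter = (void *)1, .write_iter = (void *)1,
		};
		unsigned int mode = derive_fmode(FMODE_READ | FMODE_WRITE, &fop);

		printf("can read: %d, can write: %d\n",
		       !!(mode & FMODE_CAN_READ), !!(mode & FMODE_CAN_WRITE));
		return 0;
	}
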
diff --combined fs/fuse/file.c
index 903cbc9cd6bd3a471f565e9fd3e2115539b58aca,b2dae9d1437cf36a9c61178166f9ff9f1de24ec1..6e16dad13e9b16de0358f8caaec9833d9f00a84b
@@@ -933,8 -933,7 +933,7 @@@ out
        return err;
  }
  
- static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-                                 unsigned long nr_segs, loff_t pos)
+ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        struct fuse_conn *fc = get_fuse_conn(inode);
         * i_size is up to date).
         */
        if (fc->auto_inval_data ||
-           (pos + iov_length(iov, nr_segs) > i_size_read(inode))) {
+           (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
                int err;
                err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
                if (err)
                        return err;
        }
  
-       return generic_file_aio_read(iocb, iov, nr_segs, pos);
+       return generic_file_read_iter(iocb, to);
  }
  
  static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
@@@ -1089,6 -1088,8 +1088,6 @@@ static ssize_t fuse_fill_write_pages(st
                tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
                flush_dcache_page(page);
  
 -              mark_page_accessed(page);
 -
                if (!tmp) {
                        unlock_page(page);
                        page_cache_release(page);
@@@ -1181,19 -1182,17 +1180,17 @@@ static ssize_t fuse_perform_write(struc
        return res > 0 ? res : err;
  }
  
- static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                  unsigned long nr_segs, loff_t pos)
+ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
-       size_t count = 0;
-       size_t ocount = 0;
+       size_t count = iov_iter_count(from);
        ssize_t written = 0;
        ssize_t written_buffered = 0;
        struct inode *inode = mapping->host;
        ssize_t err;
-       struct iov_iter i;
        loff_t endbyte = 0;
+       loff_t pos = iocb->ki_pos;
  
        if (get_fuse_conn(inode)->writeback_cache) {
                /* Update size (EOF optimization) and mode (SUID clearing) */
                if (err)
                        return err;
  
-               return generic_file_aio_write(iocb, iov, nr_segs, pos);
+               return generic_file_write_iter(iocb, from);
        }
  
-       WARN_ON(iocb->ki_pos != pos);
-       ocount = 0;
-       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-       if (err)
-               return err;
-       count = ocount;
        mutex_lock(&inode->i_mutex);
  
        /* We can write back this queue in page reclaim */
        if (count == 0)
                goto out;
  
+       iov_iter_truncate(from, count);
        err = file_remove_suid(file);
        if (err)
                goto out;
                goto out;
  
        if (file->f_flags & O_DIRECT) {
-               written = generic_file_direct_write(iocb, iov, &nr_segs, pos, 
-                                                   count, ocount);
-               if (written < 0 || written == count)
+               written = generic_file_direct_write(iocb, from, pos);
+               if (written < 0 || !iov_iter_count(from))
                        goto out;
  
                pos += written;
-               count -= written;
  
-               iov_iter_init(&i, iov, nr_segs, count, written);
-               written_buffered = fuse_perform_write(file, mapping, &i, pos);
+               written_buffered = fuse_perform_write(file, mapping, from, pos);
                if (written_buffered < 0) {
                        err = written_buffered;
                        goto out;
                written += written_buffered;
                iocb->ki_pos = pos + written_buffered;
        } else {
-               iov_iter_init(&i, iov, nr_segs, count, 0);
-               written = fuse_perform_write(file, mapping, &i, pos);
+               written = fuse_perform_write(file, mapping, from, pos);
                if (written >= 0)
                        iocb->ki_pos = pos + written;
        }
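
Note how the O_DIRECT fallback above needs no bookkeeping after the conversion: generic_file_direct_write() consumes from the iterator, so the untransferred tail is whatever iov_iter_count(from) still reports, and the old iov_iter_init() re-seeding before the buffered retry is gone. The fragment in isolation (error handling trimmed):

    written = generic_file_direct_write(iocb, from, pos);
    if (written < 0 || !iov_iter_count(from))
            goto out;       /* hard error, or everything went direct */

    /* short direct write: @from already points at the remainder */
    pos += written;
    written_buffered = fuse_perform_write(file, mapping, from, pos);
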
@@@ -1300,7 -1288,7 +1286,7 @@@ static int fuse_get_user_pages(struct f
        size_t nbytes = 0;  /* # bytes already packed in req */
  
        /* Special case for kernel I/O: can copy directly into the buffer */
-       if (segment_eq(get_fs(), KERNEL_DS)) {
+       if (ii->type & ITER_KVEC) {
                unsigned long user_addr = fuse_get_user_addr(ii);
                size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
  
  
        while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
                unsigned npages;
-               unsigned long user_addr = fuse_get_user_addr(ii);
-               unsigned offset = user_addr & ~PAGE_MASK;
-               size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes);
-               int ret;
+               size_t start;
                unsigned n = req->max_pages - req->num_pages;
-               frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT);
-               npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               npages = clamp(npages, 1U, n);
-               ret = get_user_pages_fast(user_addr, npages, !write,
-                                         &req->pages[req->num_pages]);
+               ssize_t ret = iov_iter_get_pages(ii,
+                                       &req->pages[req->num_pages],
+                                       n * PAGE_SIZE, &start);
                if (ret < 0)
                        return ret;
  
-               npages = ret;
-               frag_size = min_t(size_t, frag_size,
-                                 (npages << PAGE_SHIFT) - offset);
-               iov_iter_advance(ii, frag_size);
+               iov_iter_advance(ii, ret);
+               nbytes += ret;
+               ret += start;
+               npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
  
-               req->page_descs[req->num_pages].offset = offset;
+               req->page_descs[req->num_pages].offset = start;
                fuse_page_descs_length_init(req, req->num_pages, npages);
  
                req->num_pages += npages;
                req->page_descs[req->num_pages - 1].length -=
-                       (npages << PAGE_SHIFT) - offset - frag_size;
-               nbytes += frag_size;
+                       (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
        }
  
        if (write)
  
  static inline int fuse_iter_npages(const struct iov_iter *ii_p)
  {
-       struct iov_iter ii = *ii_p;
-       int npages = 0;
-       while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) {
-               unsigned long user_addr = fuse_get_user_addr(&ii);
-               unsigned offset = user_addr & ~PAGE_MASK;
-               size_t frag_size = iov_iter_single_seg_count(&ii);
-               npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               iov_iter_advance(&ii, frag_size);
-       }
-       return min(npages, FUSE_MAX_PAGES_PER_REQ);
+       return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ);
  }
  
- ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
-                      unsigned long nr_segs, size_t count, loff_t *ppos,
-                      int flags)
+ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+                      loff_t *ppos, int flags)
  {
        int write = flags & FUSE_DIO_WRITE;
        int cuse = flags & FUSE_DIO_CUSE;
        struct fuse_conn *fc = ff->fc;
        size_t nmax = write ? fc->max_write : fc->max_read;
        loff_t pos = *ppos;
+       size_t count = iov_iter_count(iter);
        pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
        pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
        ssize_t res = 0;
        struct fuse_req *req;
-       struct iov_iter ii;
-       iov_iter_init(&ii, iov, nr_segs, count, 0);
  
        if (io->async)
-               req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii));
+               req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
        else
-               req = fuse_get_req(fc, fuse_iter_npages(&ii));
+               req = fuse_get_req(fc, fuse_iter_npages(iter));
        if (IS_ERR(req))
                return PTR_ERR(req);
  
                size_t nres;
                fl_owner_t owner = current->files;
                size_t nbytes = min(count, nmax);
-               int err = fuse_get_user_pages(req, &ii, &nbytes, write);
+               int err = fuse_get_user_pages(req, iter, &nbytes, write);
                if (err) {
                        res = err;
                        break;
                        fuse_put_request(fc, req);
                        if (io->async)
                                req = fuse_get_req_for_background(fc,
-                                       fuse_iter_npages(&ii));
+                                       fuse_iter_npages(iter));
                        else
-                               req = fuse_get_req(fc, fuse_iter_npages(&ii));
+                               req = fuse_get_req(fc, fuse_iter_npages(iter));
                        if (IS_ERR(req))
                                break;
                }
  EXPORT_SYMBOL_GPL(fuse_direct_io);
  
  static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
-                                 const struct iovec *iov,
-                                 unsigned long nr_segs, loff_t *ppos,
-                                 size_t count)
+                                 struct iov_iter *iter,
+                                 loff_t *ppos)
  {
        ssize_t res;
        struct file *file = io->file;
        if (is_bad_inode(inode))
                return -EIO;
  
-       res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0);
+       res = fuse_direct_io(io, iter, ppos, 0);
  
        fuse_invalidate_attr(inode);
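
fuse_get_user_pages() above also shows the new pinning primitive. iov_iter_get_pages() returns the number of bytes it pinned and stores the offset of the first byte within the first page in *start, which is where the rounding arithmetic in the hunk comes from. The bookkeeping in isolation (a sketch):

    struct page *pages[16];
    size_t start;
    unsigned npages;
    ssize_t bytes;

    bytes = iov_iter_get_pages(iter, pages, 16 * PAGE_SIZE, &start);
    if (bytes < 0)
            return bytes;

    iov_iter_advance(iter, bytes);          /* consume what was pinned */
    /* the data begins at offset @start within pages[0] */
    npages = DIV_ROUND_UP(bytes + start, PAGE_SIZE);
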
  
@@@ -1483,22 -1446,26 +1444,26 @@@ static ssize_t fuse_direct_read(struct 
  {
        struct fuse_io_priv io = { .async = 0, .file = file };
        struct iovec iov = { .iov_base = buf, .iov_len = count };
-       return __fuse_direct_read(&io, &iov, 1, ppos, count);
+       struct iov_iter ii;
+       iov_iter_init(&ii, READ, &iov, 1, count);
+       return __fuse_direct_read(&io, &ii, ppos);
  }
  
  static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
-                                  const struct iovec *iov,
-                                  unsigned long nr_segs, loff_t *ppos)
+                                  struct iov_iter *iter,
+                                  loff_t *ppos)
  {
        struct file *file = io->file;
        struct inode *inode = file_inode(file);
-       size_t count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        ssize_t res;
  
        res = generic_write_checks(file, ppos, &count, 0);
-       if (!res)
-               res = fuse_direct_io(io, iov, nr_segs, count, ppos,
-                                    FUSE_DIO_WRITE);
+       if (!res) {
+               iov_iter_truncate(iter, count);
+               res = fuse_direct_io(io, iter, ppos, FUSE_DIO_WRITE);
+       }
  
        fuse_invalidate_attr(inode);
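
__fuse_direct_write() demonstrates the standard pairing after this series: generic_write_checks() may shrink the byte count (rlimits, s_maxbytes and friends), and the iterator must then be clipped to match before any I/O is issued. The idiom in isolation:

    size_t count = iov_iter_count(iter);
    ssize_t res;

    res = generic_write_checks(file, ppos, &count, 0);
    if (res)
            return res;
    /* keep the iterator in step with the possibly-reduced count */
    iov_iter_truncate(iter, count);
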
  
@@@ -1512,13 -1479,15 +1477,15 @@@ static ssize_t fuse_direct_write(struc
        struct inode *inode = file_inode(file);
        ssize_t res;
        struct fuse_io_priv io = { .async = 0, .file = file };
+       struct iov_iter ii;
+       iov_iter_init(&ii, WRITE, &iov, 1, count);
  
        if (is_bad_inode(inode))
                return -EIO;
  
        /* Don't allow parallel writes to the same file */
        mutex_lock(&inode->i_mutex);
-       res = __fuse_direct_write(&io, &iov, 1, ppos);
+       res = __fuse_direct_write(&io, &ii, ppos);
        if (res > 0)
                fuse_write_update_size(inode, *ppos);
        mutex_unlock(&inode->i_mutex);
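
The plain read()/write()-style entry points that remain (fuse_direct_read() and fuse_direct_write() here) now simply wrap the user buffer in a one-segment iov_iter on the stack; note that the data direction (READ or WRITE) became a property of the iterator itself in this series. Sketch:

    struct iovec iov = { .iov_base = buf, .iov_len = count };
    struct iov_iter ii;

    iov_iter_init(&ii, WRITE, &iov, 1, count);
    /* &ii can now be handed to any iter-based helper */
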
@@@ -2302,6 -2271,7 +2269,6 @@@ static int fuse_file_flock(struct file 
                struct fuse_file *ff = file->private_data;
  
                /* emulate flock with POSIX locks */
 -              fl->fl_owner = (fl_owner_t) file;
                ff->flock = true;
                err = fuse_setlk(file, fl, 1);
        }
@@@ -2372,7 -2342,7 +2339,7 @@@ static int fuse_ioctl_copy_user(struct 
        if (!bytes)
                return 0;
  
-       iov_iter_init(&ii, iov, nr_segs, bytes, 0);
+       iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes);
  
        while (iov_iter_count(&ii)) {
                struct page *page = pages[page_idx++];
@@@ -2894,8 -2864,8 +2861,8 @@@ static inline loff_t fuse_round_up(loff
  }
  
  static ssize_t
- fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-                       loff_t offset, unsigned long nr_segs)
+ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+                       loff_t offset)
  {
        ssize_t ret = 0;
        struct file *file = iocb->ki_filp;
        loff_t pos = 0;
        struct inode *inode;
        loff_t i_size;
-       size_t count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        struct fuse_io_priv *io;
  
        pos = offset;
                if (offset >= i_size)
                        return 0;
                count = min_t(loff_t, count, fuse_round_up(i_size - offset));
+               iov_iter_truncate(iter, count);
        }
  
        io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
                io->async = false;
  
        if (rw == WRITE)
-               ret = __fuse_direct_write(io, iov, nr_segs, &pos);
+               ret = __fuse_direct_write(io, iter, &pos);
        else
-               ret = __fuse_direct_read(io, iov, nr_segs, &pos, count);
+               ret = __fuse_direct_read(io, iter, &pos);
  
        if (io->async) {
                fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
  
  static const struct file_operations fuse_file_operations = {
        .llseek         = fuse_file_llseek,
-       .read           = do_sync_read,
-       .aio_read       = fuse_file_aio_read,
-       .write          = do_sync_write,
-       .aio_write      = fuse_file_aio_write,
+       .read           = new_sync_read,
+       .read_iter      = fuse_file_read_iter,
+       .write          = new_sync_write,
+       .write_iter     = fuse_file_write_iter,
        .mmap           = fuse_file_mmap,
        .open           = fuse_open,
        .flush          = fuse_flush,
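
The file_operations rewiring seen here repeats almost verbatim across the whole pile: do_sync_read()/do_sync_write(), which packaged a single iovec for ->aio_read/->aio_write, give way to new_sync_read()/new_sync_write(), which build an iov_iter and dispatch to ->read_iter/->write_iter. The resulting table shape, with hypothetical myfs_* methods:

    const struct file_operations myfs_file_operations = {
            .llseek         = generic_file_llseek,
            .read           = new_sync_read,        /* sync wrapper over ->read_iter */
            .read_iter      = generic_file_read_iter,
            .write          = new_sync_write,       /* sync wrapper over ->write_iter */
            .write_iter     = myfs_file_write_iter, /* hypothetical */
            .splice_read    = generic_file_splice_read,
            .splice_write   = iter_file_splice_write,
    };
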
diff --combined fs/gfs2/aops.c
index 492123cda64ab5d325db6a640d29d7640eeb6f10,910838951d66c375d9c7a2b38fd4a225c45245b0..805b37fed6383fc71abcb573de809ee8f3e41c53
@@@ -431,7 -431,7 +431,7 @@@ static int gfs2_jdata_writepages(struc
  
        ret = gfs2_write_cache_jdata(mapping, wbc);
        if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
 -              gfs2_log_flush(sdp, ip->i_gl);
 +              gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
                ret = gfs2_write_cache_jdata(mapping, wbc);
        }
        return ret;
@@@ -577,6 -577,7 +577,6 @@@ int gfs2_internal_read(struct gfs2_inod
                p = kmap_atomic(page);
                memcpy(buf + copied, p + offset, amt);
                kunmap_atomic(p);
 -              mark_page_accessed(page);
                page_cache_release(page);
                copied += amt;
                index++;
@@@ -1040,8 -1041,7 +1040,7 @@@ static int gfs2_ok_for_dio(struct gfs2_
  
  
  static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
-                             const struct iovec *iov, loff_t offset,
-                             unsigned long nr_segs)
+                             struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
         */
        if (mapping->nrpages) {
                loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
-               loff_t len = iov_length(iov, nr_segs);
+               loff_t len = iov_iter_count(iter);
                loff_t end = PAGE_ALIGN(offset + len) - 1;
  
                rv = 0;
                        truncate_inode_pages_range(mapping, lstart, end);
        }
  
-       rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-                                 offset, nr_segs, gfs2_get_block_direct,
-                                 NULL, NULL, 0);
+       rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+                                 iter, offset,
+                                 gfs2_get_block_direct, NULL, NULL, 0);
  out:
        gfs2_glock_dq(&gh);
        gfs2_holder_uninit(&gh);
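
gfs2's ->direct_IO() shows the address_space side of the same change: the iovec/nr_segs pair is gone, the total length is iov_iter_count(iter), and the iterator is passed straight through to __blockdev_direct_IO(). Reduced to a skeleton (myfs_get_block standing in for a real get_block_t):

    static ssize_t myfs_direct_IO(int rw, struct kiocb *iocb,
                                  struct iov_iter *iter, loff_t offset)
    {
            struct inode *inode = iocb->ki_filp->f_mapping->host;

            return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
                                        iter, offset, myfs_get_block,
                                        NULL, NULL, 0);
    }
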
diff --combined fs/gfs2/file.c
index 6ab0cfb2e891014436816e7ce2021a745291d2d6,01b4c5b1bff8ddd152a6b93ac3b5249d3d0a49a8..4fc3a3046174dc9a296c90a0d0ca6d53485e277b
@@@ -203,9 -203,9 +203,9 @@@ void gfs2_set_inode_flags(struct inode 
                             GFS2_DIF_INHERIT_JDATA)
  
  /**
 - * gfs2_set_flags - set flags on an inode
 - * @inode: The inode
 - * @flags: The flags to set
 + * do_gfs2_set_flags - set flags on an inode
 + * @filp: file pointer
 + * @reqflags: The flags to set
   * @mask: Indicates which flags are valid
   *
   */
@@@ -256,7 -256,7 +256,7 @@@ static int do_gfs2_set_flags(struct fil
        }
        if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
                if (flags & GFS2_DIF_JDATA)
 -                      gfs2_log_flush(sdp, ip->i_gl);
 +                      gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
                error = filemap_fdatawrite(inode->i_mapping);
                if (error)
                        goto out;
@@@ -318,7 -318,7 +318,7 @@@ static long gfs2_ioctl(struct file *fil
  
  /**
   * gfs2_size_hint - Give a hint to the size of a write request
 - * @file: The struct file
 + * @filep: The struct file
   * @offset: The file offset of the write
   * @size: The length of the write
   *
@@@ -371,7 -371,7 +371,7 @@@ static int gfs2_allocate_page_backing(s
  /**
   * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
   * @vma: The virtual memory area
 - * @page: The page which is about to become writable
 + * @vmf: The virtual memory fault containing the page to become writable
   *
   * When the page becomes writable, we need to ensure that we have
   * blocks allocated on disk to back that page.
@@@ -684,7 -684,7 +684,7 @@@ static int gfs2_fsync(struct file *file
  }
  
  /**
-  * gfs2_file_aio_write - Perform a write to a file
+  * gfs2_file_write_iter - Perform a write to a file
   * @iocb: The io context
   * @iov: The data to write
   * @nr_segs: Number of @iov segments
   *
   */
  
- static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                  unsigned long nr_segs, loff_t pos)
+ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
-       size_t writesize = iov_length(iov, nr_segs);
        struct gfs2_inode *ip = GFS2_I(file_inode(file));
        int ret;
  
        if (ret)
                return ret;
  
-       gfs2_size_hint(file, pos, writesize);
+       gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
  
        if (file->f_flags & O_APPEND) {
                struct gfs2_holder gh;
                gfs2_glock_dq_uninit(&gh);
        }
  
-       return generic_file_aio_write(iocb, iov, nr_segs, pos);
+       return generic_file_write_iter(iocb, from);
  }
  
  static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
@@@ -1058,10 -1056,10 +1056,10 @@@ static int gfs2_flock(struct file *file
  
  const struct file_operations gfs2_file_fops = {
        .llseek         = gfs2_llseek,
-       .read           = do_sync_read,
-       .aio_read       = generic_file_aio_read,
-       .write          = do_sync_write,
-       .aio_write      = gfs2_file_aio_write,
+       .read           = new_sync_read,
+       .read_iter      = generic_file_read_iter,
+       .write          = new_sync_write,
+       .write_iter     = gfs2_file_write_iter,
        .unlocked_ioctl = gfs2_ioctl,
        .mmap           = gfs2_mmap,
        .open           = gfs2_open,
        .lock           = gfs2_lock,
        .flock          = gfs2_flock,
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .setlease       = gfs2_setlease,
        .fallocate      = gfs2_fallocate,
  };
@@@ -1090,17 -1088,17 +1088,17 @@@ const struct file_operations gfs2_dir_f
  
  const struct file_operations gfs2_file_fops_nolock = {
        .llseek         = gfs2_llseek,
-       .read           = do_sync_read,
-       .aio_read       = generic_file_aio_read,
-       .write          = do_sync_write,
-       .aio_write      = gfs2_file_aio_write,
+       .read           = new_sync_read,
+       .read_iter      = generic_file_read_iter,
+       .write          = new_sync_write,
+       .write_iter     = gfs2_file_write_iter,
        .unlocked_ioctl = gfs2_ioctl,
        .mmap           = gfs2_mmap,
        .open           = gfs2_open,
        .release        = gfs2_release,
        .fsync          = gfs2_fsync,
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .setlease       = generic_setlease,
        .fallocate      = gfs2_fallocate,
  };
diff --combined fs/nfs/direct.c
index 4ad7bc3886791b0078ebc3ae4b326ed5e4c6566b,b122fe21fea0dce3ae5dbcbb362dcbb4820e73b8..8f98138cbc4385ba63b3af77ae907219d22e6991
@@@ -108,97 -108,6 +108,97 @@@ static inline int put_dreq(struct nfs_d
        return atomic_dec_and_test(&dreq->io_count);
  }
  
 +/*
 + * nfs_direct_select_verf - select the right verifier
 + * @dreq - direct request possibly spanning multiple servers
 + * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
 + * @ds_idx - index of data server in data server list, only valid if ds_clp set
 + *
 + * returns the correct verifier to use given the role of the server
 + */
 +static struct nfs_writeverf *
 +nfs_direct_select_verf(struct nfs_direct_req *dreq,
 +                     struct nfs_client *ds_clp,
 +                     int ds_idx)
 +{
 +      struct nfs_writeverf *verfp = &dreq->verf;
 +
 +#ifdef CONFIG_NFS_V4_1
 +      if (ds_clp) {
 +              /* pNFS is in use, use the DS verf */
 +              if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets)
 +                      verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf;
 +              else
 +                      WARN_ON_ONCE(1);
 +      }
 +#endif
 +      return verfp;
 +}
 +
 +
 +/*
 + * nfs_direct_set_hdr_verf - set the write/commit verifier
 + * @dreq - direct request possibly spanning multiple servers
 + * @hdr - pageio header to validate against previously seen verfs
 + *
 + * Set the server's (MDS or DS) "seen" verifier
 + */
 +static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
 +                                  struct nfs_pgio_header *hdr)
 +{
 +      struct nfs_writeverf *verfp;
 +
 +      verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
 +                                    hdr->data->ds_idx);
 +      WARN_ON_ONCE(verfp->committed >= 0);
 +      memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
 +      WARN_ON_ONCE(verfp->committed < 0);
 +}
 +
 +/*
 + * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
 + * @dreq - direct request possibly spanning multiple servers
 + * @hdr - pageio header to validate against previously seen verf
 + *
 + * set the server's "seen" verf if not initialized.
 + * returns result of comparison between @hdr->verf and the "seen"
 + * verf of the server used by @hdr (DS or MDS)
 + */
 +static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 +                                        struct nfs_pgio_header *hdr)
 +{
 +      struct nfs_writeverf *verfp;
 +
 +      verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
 +                                       hdr->data->ds_idx);
 +      if (verfp->committed < 0) {
 +              nfs_direct_set_hdr_verf(dreq, hdr);
 +              return 0;
 +      }
 +      return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
 +}
 +
 +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 +/*
 + * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
 + * @dreq - direct request possibly spanning multiple servers
 + * @data - commit data to validate against previously seen verf
 + *
 + * returns result of comparison between @data->verf and the verf of
 + * the server used by @data (DS or MDS)
 + */
 +static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
 +                                         struct nfs_commit_data *data)
 +{
 +      struct nfs_writeverf *verfp;
 +
 +      verfp = nfs_direct_select_verf(dreq, data->ds_clp,
 +                                       data->ds_commit_index);
 +      WARN_ON_ONCE(verfp->committed < 0);
 +      return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
 +}
 +#endif
 +
  /**
   * nfs_direct_IO - NFS address space operation for direct I/O
   * @rw: direction (read or write)
   * shunt off direct read and write requests before the VFS gets them,
   * so this method is only ever called for swap.
   */
- ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
+ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
  {
  #ifndef CONFIG_NFS_SWAP
        dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
-                       iocb->ki_filp, (long long) pos, nr_segs);
+                       iocb->ki_filp, (long long) pos, iter->nr_segs);
  
        return -EINVAL;
  #else
        VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
  
        if (rw == READ || rw == KERNEL_READ)
-               return nfs_file_direct_read(iocb, iov, nr_segs, pos,
+               return nfs_file_direct_read(iocb, iter, pos,
                                rw == READ ? true : false);
-       return nfs_file_direct_write(iocb, iov, nr_segs, pos,
+       return nfs_file_direct_write(iocb, iter, pos,
                                rw == WRITE ? true : false);
  #endif /* CONFIG_NFS_SWAP */
  }
@@@ -259,7 -168,6 +259,7 @@@ static inline struct nfs_direct_req *nf
        kref_get(&dreq->kref);
        init_completion(&dreq->completion);
        INIT_LIST_HEAD(&dreq->mds_cinfo.list);
 +      dreq->verf.committed = NFS_INVALID_STABLE_HOW;  /* not set yet */
        INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
        spin_lock_init(&dreq->lock);
  
@@@ -414,65 -322,43 +414,42 @@@ static const struct nfs_pgio_completion
   * handled automatically by nfs_direct_read_result().  Otherwise, if
   * no requests have been sent, just return an error.
   */
- static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
-                                               const struct iovec *iov,
-                                               loff_t pos, bool uio)
- {
-       struct nfs_direct_req *dreq = desc->pg_dreq;
-       struct nfs_open_context *ctx = dreq->ctx;
-       struct inode *inode = ctx->dentry->d_inode;
-       unsigned long user_addr = (unsigned long)iov->iov_base;
-       size_t count = iov->iov_len;
-       size_t rsize = NFS_SERVER(inode)->rsize;
-       unsigned int pgbase;
-       int result;
-       ssize_t started = 0;
-       struct page **pagevec = NULL;
-       unsigned int npages;
-
-       do {
-               size_t bytes;
-               int i;
  
-               pgbase = user_addr & ~PAGE_MASK;
-               bytes = min(max_t(size_t, rsize, PAGE_SIZE), count);
+ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
+                                             struct iov_iter *iter,
+                                             loff_t pos)
+ {
+       struct nfs_pageio_descriptor desc;
+       struct inode *inode = dreq->inode;
+       ssize_t result = -EINVAL;
+       size_t requested_bytes = 0;
+       size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);
  
-               result = -ENOMEM;
-               npages = nfs_page_array_len(pgbase, bytes);
-               if (!pagevec)
-                       pagevec = kmalloc(npages * sizeof(struct page *),
-                                         GFP_KERNEL);
-               if (!pagevec)
-                       break;
-               if (uio) {
-                       down_read(&current->mm->mmap_sem);
-                       result = get_user_pages(current, current->mm, user_addr,
-                                       npages, 1, 0, pagevec, NULL);
-                       up_read(&current->mm->mmap_sem);
-                       if (result < 0)
-                               break;
-               } else {
-                       WARN_ON(npages != 1);
-                       result = get_kernel_page(user_addr, 1, pagevec);
-                       if (WARN_ON(result != 1))
-                               break;
-               }
 -      NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode,
++      nfs_pageio_init_read(&desc, dreq->inode, false,
+                            &nfs_direct_read_completion_ops);
+       get_dreq(dreq);
+       desc.pg_dreq = dreq;
+       atomic_inc(&inode->i_dio_count);
  
-               if ((unsigned)result < npages) {
-                       bytes = result * PAGE_SIZE;
-                       if (bytes <= pgbase) {
-                               nfs_direct_release_pages(pagevec, result);
-                               break;
-                       }
-                       bytes -= pgbase;
-                       npages = result;
-               }
+       while (iov_iter_count(iter)) {
+               struct page **pagevec;
+               size_t bytes;
+               size_t pgbase;
+               unsigned npages, i;
  
+               result = iov_iter_get_pages_alloc(iter, &pagevec, 
+                                                 rsize, &pgbase);
+               if (result < 0)
+                       break;
+       
+               bytes = result;
+               iov_iter_advance(iter, bytes);
+               npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
                for (i = 0; i < npages; i++) {
                        struct nfs_page *req;
                        unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
                        /* XXX do we need to do the eof zeroing found in async_filler? */
 -                      req = nfs_create_request(dreq->ctx, dreq->inode,
 -                                               pagevec[i],
 +                      req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
                                                 pgbase, req_len);
                        if (IS_ERR(req)) {
                                result = PTR_ERR(req);
                        }
                        req->wb_index = pos >> PAGE_SHIFT;
                        req->wb_offset = pos & ~PAGE_MASK;
-                       if (!nfs_pageio_add_request(desc, req)) {
-                               result = desc->pg_error;
+                       if (!nfs_pageio_add_request(&desc, req)) {
+                               result = desc.pg_error;
                                nfs_release_request(req);
                                break;
                        }
                        pgbase = 0;
                        bytes -= req_len;
-                       started += req_len;
-                       user_addr += req_len;
+                       requested_bytes += req_len;
                        pos += req_len;
-                       count -= req_len;
                        dreq->bytes_left -= req_len;
                }
-               /* The nfs_page now hold references to these pages */
                nfs_direct_release_pages(pagevec, npages);
-       } while (count != 0 && result >= 0);
-
-       kfree(pagevec);
-
-       if (started)
-               return started;
-       return result < 0 ? (ssize_t) result : -EFAULT;
- }
-
- static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
-                                             const struct iovec *iov,
-                                             unsigned long nr_segs,
-                                             loff_t pos, bool uio)
- {
-       struct nfs_pageio_descriptor desc;
-       struct inode *inode = dreq->inode;
-       ssize_t result = -EINVAL;
-       size_t requested_bytes = 0;
-       unsigned long seg;
-
-       nfs_pageio_init_read(&desc, dreq->inode, false,
-                            &nfs_direct_read_completion_ops);
-       get_dreq(dreq);
-       desc.pg_dreq = dreq;
-       atomic_inc(&inode->i_dio_count);
-
-       for (seg = 0; seg < nr_segs; seg++) {
-               const struct iovec *vec = &iov[seg];
-               result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
+               kvfree(pagevec);
                if (result < 0)
                        break;
-               requested_bytes += result;
-               if ((size_t)result < vec->iov_len)
-                       break;
-               pos += vec->iov_len;
        }
  
        nfs_pageio_complete(&desc);
  /**
   * nfs_file_direct_read - file direct read operation for NFS files
   * @iocb: target I/O control block
-  * @iov: vector of user buffers into which to read data
-  * @nr_segs: size of iov vector
+  * @iter: vector of user buffers into which to read data
   * @pos: byte offset in file where reading starts
   *
   * We use this function for direct reads instead of calling
   * client must read the updated atime from the server back into its
   * cache.
   */
- ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos, bool uio)
+ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+                               loff_t pos, bool uio)
  {
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct nfs_direct_req *dreq;
        struct nfs_lock_context *l_ctx;
        ssize_t result = -EINVAL;
-       size_t count;
-       count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
  
        dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
                goto out_unlock;
  
        dreq->inode = inode;
-       dreq->bytes_left = iov_length(iov, nr_segs);
+       dreq->bytes_left = count;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        l_ctx = nfs_get_lock_context(dreq->ctx);
        if (IS_ERR(l_ctx)) {
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
  
-       NFS_I(inode)->read_io += iov_length(iov, nr_segs);
-       result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+       NFS_I(inode)->read_io += count;
+       result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
  
        mutex_unlock(&inode->i_mutex);
  
@@@ -655,7 -503,7 +594,7 @@@ static void nfs_direct_write_reschedule
        dreq->count = 0;
        get_dreq(dreq);
  
 -      NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE,
 +      nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
                              &nfs_direct_write_completion_ops);
        desc.pg_dreq = dreq;
  
@@@ -694,7 -542,7 +633,7 @@@ static void nfs_direct_commit_complete(
                dprintk("NFS: %5u commit failed with error %d.\n",
                        data->task.tk_pid, status);
                dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 -      } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
 +      } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {
                dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
                dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
        }
@@@ -772,108 -620,6 +711,6 @@@ static void nfs_direct_write_complete(s
  }
  #endif
  
- /*
-  * NB: Return the value of the first error return code.  Subsequent
-  *     errors after the first one are ignored.
-  */
- /*
-  * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
-  * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
-  * bail and stop sending more writes.  Write length accounting is
-  * handled automatically by nfs_direct_write_result().  Otherwise, if
-  * no requests have been sent, just return an error.
-  */
- static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
-                                                const struct iovec *iov,
-                                                loff_t pos, bool uio)
- {
-       struct nfs_direct_req *dreq = desc->pg_dreq;
-       struct nfs_open_context *ctx = dreq->ctx;
-       struct inode *inode = ctx->dentry->d_inode;
-       unsigned long user_addr = (unsigned long)iov->iov_base;
-       size_t count = iov->iov_len;
-       size_t wsize = NFS_SERVER(inode)->wsize;
-       unsigned int pgbase;
-       int result;
-       ssize_t started = 0;
-       struct page **pagevec = NULL;
-       unsigned int npages;
-
-       do {
-               size_t bytes;
-               int i;
-
-               pgbase = user_addr & ~PAGE_MASK;
-               bytes = min(max_t(size_t, wsize, PAGE_SIZE), count);
-
-               result = -ENOMEM;
-               npages = nfs_page_array_len(pgbase, bytes);
-               if (!pagevec)
-                       pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
-               if (!pagevec)
-                       break;
-               if (uio) {
-                       down_read(&current->mm->mmap_sem);
-                       result = get_user_pages(current, current->mm, user_addr,
-                                               npages, 0, 0, pagevec, NULL);
-                       up_read(&current->mm->mmap_sem);
-                       if (result < 0)
-                               break;
-               } else {
-                       WARN_ON(npages != 1);
-                       result = get_kernel_page(user_addr, 0, pagevec);
-                       if (WARN_ON(result != 1))
-                               break;
-               }
-
-               if ((unsigned)result < npages) {
-                       bytes = result * PAGE_SIZE;
-                       if (bytes <= pgbase) {
-                               nfs_direct_release_pages(pagevec, result);
-                               break;
-                       }
-                       bytes -= pgbase;
-                       npages = result;
-               }
-
-               for (i = 0; i < npages; i++) {
-                       struct nfs_page *req;
-                       unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
-                       req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
-                                                pgbase, req_len);
-                       if (IS_ERR(req)) {
-                               result = PTR_ERR(req);
-                               break;
-                       }
-                       nfs_lock_request(req);
-                       req->wb_index = pos >> PAGE_SHIFT;
-                       req->wb_offset = pos & ~PAGE_MASK;
-                       if (!nfs_pageio_add_request(desc, req)) {
-                               result = desc->pg_error;
-                               nfs_unlock_and_release_request(req);
-                               break;
-                       }
-                       pgbase = 0;
-                       bytes -= req_len;
-                       started += req_len;
-                       user_addr += req_len;
-                       pos += req_len;
-                       count -= req_len;
-                       dreq->bytes_left -= req_len;
-               }
-               /* The nfs_page now hold references to these pages */
-               nfs_direct_release_pages(pagevec, npages);
-       } while (count != 0 && result >= 0);
-
-       kfree(pagevec);
-
-       if (started)
-               return started;
-       return result < 0 ? (ssize_t) result : -EFAULT;
- }
  static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
  {
        struct nfs_direct_req *dreq = hdr->dreq;
                        if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
                                bit = NFS_IOHDR_NEED_RESCHED;
                        else if (dreq->flags == 0) {
 -                              memcpy(&dreq->verf, hdr->verf,
 -                                     sizeof(dreq->verf));
 +                              nfs_direct_set_hdr_verf(dreq, hdr);
                                bit = NFS_IOHDR_NEED_COMMIT;
                                dreq->flags = NFS_ODIRECT_DO_COMMIT;
                        } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
 -                              if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) {
 -                                      dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 +                              if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) {
 +                                      dreq->flags =
 +                                              NFS_ODIRECT_RESCHED_WRITES;
                                        bit = NFS_IOHDR_NEED_RESCHED;
                                } else
                                        bit = NFS_IOHDR_NEED_COMMIT;
        spin_unlock(&dreq->lock);
  
        while (!list_empty(&hdr->pages)) {
 +              bool do_destroy = true;
 +
                req = nfs_list_entry(hdr->pages.next);
                nfs_list_remove_request(req);
                switch (bit) {
                case NFS_IOHDR_NEED_COMMIT:
                        kref_get(&req->wb_kref);
                        nfs_mark_request_commit(req, hdr->lseg, &cinfo);
 +                      do_destroy = false;
                }
                nfs_unlock_and_release_request(req);
        }
@@@ -956,33 -699,78 +793,77 @@@ static const struct nfs_pgio_completion
        .completion = nfs_direct_write_completion,
  };
  
+ /*
+  * NB: Return the value of the first error return code.  Subsequent
+  *     errors after the first one are ignored.
+  */
+ /*
+  * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
+  * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
+  * bail and stop sending more writes.  Write length accounting is
+  * handled automatically by nfs_direct_write_result().  Otherwise, if
+  * no requests have been sent, just return an error.
+  */
  static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
-                                              const struct iovec *iov,
-                                              unsigned long nr_segs,
-                                              loff_t pos, bool uio)
+                                              struct iov_iter *iter,
+                                              loff_t pos)
  {
        struct nfs_pageio_descriptor desc;
        struct inode *inode = dreq->inode;
        ssize_t result = 0;
        size_t requested_bytes = 0;
-       unsigned long seg;
+       size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
  
 -      NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE,
 +      nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
                              &nfs_direct_write_completion_ops);
        desc.pg_dreq = dreq;
        get_dreq(dreq);
        atomic_inc(&inode->i_dio_count);
  
-       NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs);
-       for (seg = 0; seg < nr_segs; seg++) {
-               const struct iovec *vec = &iov[seg];
-               result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
+       NFS_I(inode)->write_io += iov_iter_count(iter);
+       while (iov_iter_count(iter)) {
+               struct page **pagevec;
+               size_t bytes;
+               size_t pgbase;
+               unsigned npages, i;
+
+               result = iov_iter_get_pages_alloc(iter, &pagevec,
+                                                 wsize, &pgbase);
                if (result < 0)
                        break;
-               requested_bytes += result;
-               if ((size_t)result < vec->iov_len)
+               bytes = result;
+               iov_iter_advance(iter, bytes);
+               npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+               for (i = 0; i < npages; i++) {
+                       struct nfs_page *req;
+                       unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 -                      req = nfs_create_request(dreq->ctx, inode,
 -                                               pagevec[i],
++                      req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
+                                                pgbase, req_len);
+                       if (IS_ERR(req)) {
+                               result = PTR_ERR(req);
+                               break;
+                       }
+                       nfs_lock_request(req);
+                       req->wb_index = pos >> PAGE_SHIFT;
+                       req->wb_offset = pos & ~PAGE_MASK;
+                       if (!nfs_pageio_add_request(&desc, req)) {
+                               result = desc.pg_error;
+                               nfs_unlock_and_release_request(req);
+                               break;
+                       }
+                       pgbase = 0;
+                       bytes -= req_len;
+                       requested_bytes += req_len;
+                       pos += req_len;
+                       dreq->bytes_left -= req_len;
+               }
+               nfs_direct_release_pages(pagevec, npages);
+               kvfree(pagevec);
+               if (result < 0)
                        break;
-               pos += vec->iov_len;
        }
        nfs_pageio_complete(&desc);
  
  /**
   * nfs_file_direct_write - file direct write operation for NFS files
   * @iocb: target I/O control block
-  * @iov: vector of user buffers from which to write data
-  * @nr_segs: size of iov vector
+  * @iter: vector of user buffers from which to write data
   * @pos: byte offset in file where writing starts
   *
   * We use this function for direct writes instead of calling
   * Note that O_APPEND is not supported for NFS direct writes, as there
   * is no atomic O_APPEND write facility in the NFS protocol.
   */
- ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos, bool uio)
+ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+                               loff_t pos, bool uio)
  {
        ssize_t result = -EINVAL;
        struct file *file = iocb->ki_filp;
        struct nfs_direct_req *dreq;
        struct nfs_lock_context *l_ctx;
        loff_t end;
-       size_t count;
-       count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
  
        nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
  
-       result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+       result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
  
        if (mapping->nrpages) {
                invalidate_inode_pages2_range(mapping,
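
The two rewritten NFS scheduling loops above use the allocating variant of the pinning primitive: iov_iter_get_pages_alloc() sizes and allocates the page array itself, which is why the release path is kvfree() rather than kfree() (the array may come from vmalloc for large requests). The loop skeleton, with the nfs_page setup elided:

    while (iov_iter_count(iter)) {
            struct page **pagevec;
            size_t pgbase;
            unsigned npages;
            ssize_t bytes;

            /* wsize: the server's transfer-size cap, as in the hunk above */
            bytes = iov_iter_get_pages_alloc(iter, &pagevec, wsize, &pgbase);
            if (bytes < 0)
                    break;
            iov_iter_advance(iter, bytes);
            npages = DIV_ROUND_UP(bytes + pgbase, PAGE_SIZE);

            /* ... create and queue one request per pinned page ... */

            nfs_direct_release_pages(pagevec, npages);
            kvfree(pagevec);
    }
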
diff --combined fs/nfs/file.c
index c1edf7336315c3f8ddffe45261d814b8f1877771,f4ae5d0525e25e38cb4bffdd8a2d15c911247e60..4042ff58fe3f3d0b18d705774c3f6d975e642248
@@@ -165,22 -165,21 +165,21 @@@ nfs_file_flush(struct file *file, fl_ow
  EXPORT_SYMBOL_GPL(nfs_file_flush);
  
  ssize_t
- nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long nr_segs, loff_t pos)
+ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
  {
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t result;
  
        if (iocb->ki_filp->f_flags & O_DIRECT)
-               return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
+               return nfs_file_direct_read(iocb, to, iocb->ki_pos, true);
  
-       dprintk("NFS: read(%pD2, %lu@%lu)\n",
+       dprintk("NFS: read(%pD2, %zu@%lu)\n",
                iocb->ki_filp,
-               (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
+               iov_iter_count(to), (unsigned long) iocb->ki_pos);
  
        result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
        if (!result) {
-               result = generic_file_aio_read(iocb, iov, nr_segs, pos);
+               result = generic_file_read_iter(iocb, to);
                if (result > 0)
                        nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
        }
@@@ -635,24 -634,24 +634,24 @@@ static int nfs_need_sync_write(struct f
        return 0;
  }
  
- ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
-                      unsigned long nr_segs, loff_t pos)
+ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        unsigned long written = 0;
        ssize_t result;
-       size_t count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(from);
+       loff_t pos = iocb->ki_pos;
  
        result = nfs_key_timeout_notify(file, inode);
        if (result)
                return result;
  
        if (file->f_flags & O_DIRECT)
-               return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
+               return nfs_file_direct_write(iocb, from, pos, true);
  
-       dprintk("NFS: write(%pD2, %lu@%Ld)\n",
-               file, (unsigned long) count, (long long) pos);
+       dprintk("NFS: write(%pD2, %zu@%Ld)\n",
+               file, count, (long long) pos);
  
        result = -EBUSY;
        if (IS_SWAPFILE(inode))
        if (!count)
                goto out;
  
-       result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+       result = generic_file_write_iter(iocb, from);
        if (result > 0)
                written = result;
  
@@@ -691,36 -690,6 +690,6 @@@ out_swapfile
  }
  EXPORT_SYMBOL_GPL(nfs_file_write);
  
- ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
-                             struct file *filp, loff_t *ppos,
-                             size_t count, unsigned int flags)
- {
-       struct inode *inode = file_inode(filp);
-       unsigned long written = 0;
-       ssize_t ret;
-       dprintk("NFS splice_write(%pD2, %lu@%llu)\n",
-               filp, (unsigned long) count, (unsigned long long) *ppos);
-       /*
-        * The combination of splice and an O_APPEND destination is disallowed.
-        */
-       ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
-       if (ret > 0)
-               written = ret;
-       if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
-               int err = vfs_fsync(filp, 0);
-               if (err < 0)
-                       ret = err;
-       }
-       if (ret > 0)
-               nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
-       return ret;
- }
- EXPORT_SYMBOL_GPL(nfs_file_splice_write);
  static int
  do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
  {
@@@ -916,6 -885,10 +885,6 @@@ int nfs_flock(struct file *filp, int cm
                is_local = 1;
  
        /* We're simulating flock() locks using posix locks on the server */
 -      fl->fl_owner = (fl_owner_t)filp;
 -      fl->fl_start = 0;
 -      fl->fl_end = OFFSET_MAX;
 -
        if (fl->fl_type == F_UNLCK)
                return do_unlk(filp, cmd, fl, is_local);
        return do_setlk(filp, cmd, fl, is_local);
@@@ -935,10 -908,10 +904,10 @@@ EXPORT_SYMBOL_GPL(nfs_setlease)
  
  const struct file_operations nfs_file_operations = {
        .llseek         = nfs_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = nfs_file_read,
-       .aio_write      = nfs_file_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = nfs_file_read,
+       .write_iter     = nfs_file_write,
        .mmap           = nfs_file_mmap,
        .open           = nfs_file_open,
        .flush          = nfs_file_flush,
        .lock           = nfs_lock,
        .flock          = nfs_flock,
        .splice_read    = nfs_file_splice_read,
-       .splice_write   = nfs_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .check_flags    = nfs_check_flags,
        .setlease       = nfs_setlease,
  };
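
nfs_file_splice_write(), deleted above, illustrates why generic_file_splice_write() could be killed at the end of this series: iter_file_splice_write() feeds the pipe pages to ->write_iter() as a bvec-backed iov_iter, so nfs_file_write() and its sync-on-write handling now cover the splice path too, and the per-filesystem splice_write duplicates go away. The wiring shrinks to:

    .splice_read    = nfs_file_splice_read,
    .splice_write   = iter_file_splice_write,       /* routed through ->write_iter() */
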
diff --combined fs/nfs/internal.h
index 8b69cba1bb04d9b177bca18a2f95c7b0162b8cf1,0e4e8049c9f5318bed90a4ed939398fa9e47cdf8..82ddbf46660e3c1be7d499f2ca014ce619da8603
@@@ -231,20 -231,13 +231,20 @@@ extern void nfs_destroy_writepagecache(
  
  extern int __init nfs_init_directcache(void);
  extern void nfs_destroy_directcache(void);
 -extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount);
  extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
                              struct nfs_pgio_header *hdr,
                              void (*release)(struct nfs_pgio_header *hdr));
  void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
  int nfs_iocounter_wait(struct nfs_io_counter *c);
  
 +extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
 +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *);
 +void nfs_rw_header_free(struct nfs_pgio_header *);
 +void nfs_pgio_data_release(struct nfs_pgio_data *);
 +int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
 +int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *,
 +                    const struct rpc_call_ops *, int, int);
 +
  static inline void nfs_iocounter_init(struct nfs_io_counter *c)
  {
        c->flags = 0;
@@@ -327,16 -320,14 +327,14 @@@ int nfs_rename(struct inode *, struct d
  int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
  loff_t nfs_file_llseek(struct file *, loff_t, int);
  int nfs_file_flush(struct file *, fl_owner_t);
- ssize_t nfs_file_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
  ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
                             size_t, unsigned int);
  int nfs_file_mmap(struct file *, struct vm_area_struct *);
- ssize_t nfs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
  int nfs_file_release(struct inode *, struct file *);
  int nfs_lock(struct file *, int, struct file_lock *);
  int nfs_flock(struct file *, int, struct file_lock *);
- ssize_t nfs_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *,
-                             size_t, unsigned int);
  int nfs_check_flags(int);
  int nfs_setlease(struct file *, long, struct file_lock **);
  
@@@ -402,11 -393,19 +400,11 @@@ extern int nfs4_get_rootfh(struct nfs_s
  
  struct nfs_pgio_completion_ops;
  /* read.c */
 -extern struct nfs_read_header *nfs_readhdr_alloc(void);
 -extern void nfs_readhdr_free(struct nfs_pgio_header *hdr);
  extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 -                      struct inode *inode,
 +                      struct inode *inode, bool force_mds,
                        const struct nfs_pgio_completion_ops *compl_ops);
 -extern int nfs_initiate_read(struct rpc_clnt *clnt,
 -                           struct nfs_read_data *data,
 -                           const struct rpc_call_ops *call_ops, int flags);
  extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 -extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
 -                            struct nfs_pgio_header *hdr);
  extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 -extern void nfs_readdata_release(struct nfs_read_data *rdata);
  
  /* super.c */
  void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
@@@ -421,10 -420,19 +419,10 @@@ int nfs_remount(struct super_block *sb
  
  /* write.c */
  extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 -                      struct inode *inode, int ioflags,
 +                      struct inode *inode, int ioflags, bool force_mds,
                        const struct nfs_pgio_completion_ops *compl_ops);
 -extern struct nfs_write_header *nfs_writehdr_alloc(void);
 -extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
 -extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
 -                           struct nfs_pgio_header *hdr);
  extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
 -extern void nfs_writedata_release(struct nfs_write_data *wdata);
  extern void nfs_commit_free(struct nfs_commit_data *p);
 -extern int nfs_initiate_write(struct rpc_clnt *clnt,
 -                            struct nfs_write_data *data,
 -                            const struct rpc_call_ops *call_ops,
 -                            int how, int flags);
  extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
  extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
  extern int nfs_initiate_commit(struct rpc_clnt *clnt,
@@@ -437,7 -445,6 +435,7 @@@ extern void nfs_init_commit(struct nfs_
                            struct nfs_commit_info *cinfo);
  int nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
                         struct nfs_commit_info *cinfo, int max);
 +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *);
  int nfs_scan_commit(struct inode *inode, struct list_head *dst,
                    struct nfs_commit_info *cinfo);
  void nfs_mark_request_commit(struct nfs_page *req,
@@@ -483,7 -490,7 +481,7 @@@ static inline void nfs_inode_dio_wait(s
  extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
  
  /* nfs4proc.c */
 -extern void __nfs4_read_done_cb(struct nfs_read_data *);
 +extern void __nfs4_read_done_cb(struct nfs_pgio_data *);
  extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
                            const struct rpc_timeout *timeparms,
                            const char *ip_addr);
diff --combined fs/nfs/nfs4file.c
index 464db9dd63180dc7baf3695f51471747426144fb,50de2cdea082580e1020903d2aa7e1a06644704f..a816f0627a6ce03cda2502c42c780c5ab6a2742c
@@@ -100,7 -100,8 +100,7 @@@ nfs4_file_fsync(struct file *file, loff
                        break;
                mutex_lock(&inode->i_mutex);
                ret = nfs_file_fsync_commit(file, start, end, datasync);
 -              if (!ret && !datasync)
 -                      /* application has asked for meta-data sync */
 +              if (!ret)
                        ret = pnfs_layoutcommit_inode(inode, true);
                mutex_unlock(&inode->i_mutex);
                /*
  
  const struct file_operations nfs4_file_operations = {
        .llseek         = nfs_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = nfs_file_read,
-       .aio_write      = nfs_file_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = nfs_file_read,
+       .write_iter     = nfs_file_write,
        .mmap           = nfs_file_mmap,
        .open           = nfs4_file_open,
        .flush          = nfs_file_flush,
        .lock           = nfs_lock,
        .flock          = nfs_flock,
        .splice_read    = nfs_file_splice_read,
-       .splice_write   = nfs_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .check_flags    = nfs_check_flags,
        .setlease       = nfs_setlease,
  };
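
This nfs4 table shows the wiring pattern the series repeats in every filesystem it touches: .read/.write become the new_sync_* shims (which build an iov_iter and call the *_iter methods), and per-filesystem splice_write paths collapse into iter_file_splice_write(), which also goes through ->write_iter(). A hedged sketch of the same wiring for a filesystem that can use the fully generic paths (the myfs naming is hypothetical):

	const struct file_operations myfs_file_operations = {
		.llseek		= generic_file_llseek,
		.read		= new_sync_read,	/* sync path over ->read_iter */
		.write		= new_sync_write,	/* sync path over ->write_iter */
		.read_iter	= generic_file_read_iter,
		.write_iter	= generic_file_write_iter,
		.mmap		= generic_file_mmap,
		.splice_read	= generic_file_splice_read,
		.splice_write	= iter_file_splice_write, /* splice over ->write_iter */
	};
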
diff --combined fs/ntfs/file.c
index 86ddab916b6607e3cab28c276359b8b98971a46c,89b4d6663775276b2a0229026b6b19bff46f2a27..5c9e2c81cb11db029ece7873766041ada8c65024
@@@ -2060,6 -2060,7 +2060,6 @@@ static ssize_t ntfs_file_buffered_write
                }
                do {
                        unlock_page(pages[--do_pages]);
 -                      mark_page_accessed(pages[do_pages]);
                        page_cache_release(pages[do_pages]);
                } while (do_pages);
                if (unlikely(status))
@@@ -2090,10 -2091,7 +2090,7 @@@ static ssize_t ntfs_file_aio_write_nolo
        size_t count;           /* after file limit checks */
        ssize_t written, err;
  
-       count = 0;
-       err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
-       if (err)
-               return err;
+       count = iov_length(iov, nr_segs);
        pos = *ppos;
        /* We can write back this queue in page reclaim. */
        current->backing_dev_info = mapping->backing_dev_info;
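
The deleted generic_segment_checks() call both validated the user iovec and returned the total byte count; on the iov_iter-based paths the validation happens at copy time, so ntfs only needs the length, which iov_length() computes by summing the segments. A self-contained user-space model of that sum, for illustration only:

	#include <stddef.h>
	#include <sys/uio.h>

	/* what iov_length() boils down to: total bytes across all segments */
	static size_t iov_length_model(const struct iovec *iov,
				       unsigned long nr_segs)
	{
		size_t total = 0;
		unsigned long seg;

		for (seg = 0; seg < nr_segs; seg++)
			total += iov[seg].iov_len;
		return total;
	}
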
@@@ -2202,8 -2200,8 +2199,8 @@@ static int ntfs_file_fsync(struct file 
  
  const struct file_operations ntfs_file_ops = {
        .llseek         = generic_file_llseek,   /* Seek inside file. */
-       .read           = do_sync_read,          /* Read from file. */
-       .aio_read       = generic_file_aio_read, /* Async read from file. */
+       .read           = new_sync_read,         /* Read from file. */
+       .read_iter      = generic_file_read_iter, /* Async read from file. */
  #ifdef NTFS_RW
        .write          = do_sync_write,         /* Write to file. */
        .aio_write      = ntfs_file_aio_write,   /* Async write to file. */
diff --combined fs/ocfs2/file.c
index 8eb6e5732d3b73b115abea0681cdd95d9195becf,465c95016a39abed758984f0d360dae8c0396193..2930e231f3f9fbda2807190726d6ceffffa2a6b5
@@@ -828,7 -828,7 +828,7 @@@ static int ocfs2_write_zero_page(struc
                /*
                 * fs-writeback will release the dirty pages without page lock
                 * whose offset are over inode size, the release happens at
 -               * block_write_full_page_endio().
 +               * block_write_full_page().
                 */
                i_size_write(inode, abs_to);
                inode->i_blocks = ocfs2_inode_sector_count(inode);
        return ret;
  }
  
- static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
-                                   const struct iovec *iov,
-                                   unsigned long nr_segs,
-                                   loff_t pos)
+ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
+                                   struct iov_iter *from)
  {
        int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
        int can_do_direct, has_refcount = 0;
        ssize_t written = 0;
-       size_t ocount;          /* original count */
-       size_t count;           /* after file limit checks */
+       size_t count = iov_iter_count(from);
        loff_t old_size, *ppos = &iocb->ki_pos;
        u32 old_clusters;
        struct file *file = iocb->ki_filp;
                (unsigned long long)OCFS2_I(inode)->ip_blkno,
                file->f_path.dentry->d_name.len,
                file->f_path.dentry->d_name.name,
-               (unsigned int)nr_segs);
+               (unsigned int)from->nr_segs);   /* GRRRRR */
  
        if (iocb->ki_nbytes == 0)
                return 0;
@@@ -2354,29 -2351,21 +2351,21 @@@ relock
        /* communicate with ocfs2_dio_end_io */
        ocfs2_iocb_set_rw_locked(iocb, rw_level);
  
-       ret = generic_segment_checks(iov, &nr_segs, &ocount,
-                                    VERIFY_READ);
-       if (ret)
-               goto out_dio;
-       count = ocount;
        ret = generic_write_checks(file, ppos, &count,
                                   S_ISBLK(inode->i_mode));
        if (ret)
                goto out_dio;
  
+       iov_iter_truncate(from, count);
        if (direct_io) {
-               written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
-                                                   count, ocount);
+               written = generic_file_direct_write(iocb, from, *ppos);
                if (written < 0) {
                        ret = written;
                        goto out_dio;
                }
        } else {
-               struct iov_iter from;
-               iov_iter_init(&from, iov, nr_segs, count, 0);
                current->backing_dev_info = file->f_mapping->backing_dev_info;
-               written = generic_perform_write(file, &from, *ppos);
+               written = generic_perform_write(file, from, *ppos);
                if (likely(written >= 0))
                        iocb->ki_pos = *ppos + written;
                current->backing_dev_info = NULL;
@@@ -2441,84 -2430,6 +2430,6 @@@ out_sems
        return ret;
  }
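
The converted ocfs2_file_write_iter() above follows the shape shared by the ->write_iter conversions in this series: take the byte count from the iterator, run generic_write_checks(), clamp the iterator to the possibly-reduced count, then hand the same iov_iter to the direct or buffered generic helper. A condensed sketch of just that skeleton, with the ocfs2 locking and refcount handling elided (a schematic under those assumptions, not a drop-in implementation):

	static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
	{
		struct file *file = iocb->ki_filp;
		struct inode *inode = file_inode(file);
		size_t count = iov_iter_count(from);	/* bytes left in the iterator */
		loff_t pos = iocb->ki_pos;
		ssize_t written;
		int ret;

		ret = generic_write_checks(file, &pos, &count,
					   S_ISBLK(inode->i_mode));
		if (ret)
			return ret;

		/* generic_write_checks() may shrink count; clamp the iterator too */
		iov_iter_truncate(from, count);

		if (file->f_flags & O_DIRECT) {
			/* updates iocb->ki_pos itself on success */
			written = generic_file_direct_write(iocb, from, pos);
		} else {
			current->backing_dev_info = file->f_mapping->backing_dev_info;
			written = generic_perform_write(file, from, pos);
			if (likely(written >= 0))
				iocb->ki_pos = pos + written;
			current->backing_dev_info = NULL;
		}
		return written;
	}
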
  
- static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
-                               struct file *out,
-                               struct splice_desc *sd)
- {
-       int ret;
-       ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
-                                           sd->total_len, 0, NULL, NULL);
-       if (ret < 0) {
-               mlog_errno(ret);
-               return ret;
-       }
-       return splice_from_pipe_feed(pipe, sd, pipe_to_file);
- }
- static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
-                                      struct file *out,
-                                      loff_t *ppos,
-                                      size_t len,
-                                      unsigned int flags)
- {
-       int ret;
-       struct address_space *mapping = out->f_mapping;
-       struct inode *inode = mapping->host;
-       struct splice_desc sd = {
-               .total_len = len,
-               .flags = flags,
-               .pos = *ppos,
-               .u.file = out,
-       };
-       trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
-                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                       out->f_path.dentry->d_name.len,
-                       out->f_path.dentry->d_name.name, len);
-       pipe_lock(pipe);
-       splice_from_pipe_begin(&sd);
-       do {
-               ret = splice_from_pipe_next(pipe, &sd);
-               if (ret <= 0)
-                       break;
-               mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
-               ret = ocfs2_rw_lock(inode, 1);
-               if (ret < 0)
-                       mlog_errno(ret);
-               else {
-                       ret = ocfs2_splice_to_file(pipe, out, &sd);
-                       ocfs2_rw_unlock(inode, 1);
-               }
-               mutex_unlock(&inode->i_mutex);
-       } while (ret > 0);
-       splice_from_pipe_end(pipe, &sd);
-       pipe_unlock(pipe);
-       if (sd.num_spliced)
-               ret = sd.num_spliced;
-       if (ret > 0) {
-               int err;
-               err = generic_write_sync(out, *ppos, ret);
-               if (err)
-                       ret = err;
-               else
-                       *ppos += ret;
-               balance_dirty_pages_ratelimited(mapping);
-       }
-       return ret;
- }
  static ssize_t ocfs2_file_splice_read(struct file *in,
                                      loff_t *ppos,
                                      struct pipe_inode_info *pipe,
                        in->f_path.dentry->d_name.name, len);
  
        /*
-        * See the comment in ocfs2_file_aio_read()
+        * See the comment in ocfs2_file_read_iter()
         */
        ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
        if (ret < 0) {
@@@ -2549,10 -2460,8 +2460,8 @@@ bail
        return ret;
  }
  
- static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
-                                  const struct iovec *iov,
-                                  unsigned long nr_segs,
-                                  loff_t pos)
+ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
+                                  struct iov_iter *to)
  {
        int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
        struct file *filp = iocb->ki_filp;
        trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
                        filp->f_path.dentry->d_name.len,
-                       filp->f_path.dentry->d_name.name, nr_segs);
+                       filp->f_path.dentry->d_name.name,
+                       to->nr_segs);   /* GRRRRR */
  
  
        if (!inode) {
        }
        ocfs2_inode_unlock(inode, lock_level);
  
-       ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
+       ret = generic_file_read_iter(iocb, to);
        trace_generic_file_aio_read_ret(ret);
  
        /* buffered aio wouldn't have proper lock coverage today */
        BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
  
-       /* see ocfs2_file_aio_write */
+       /* see ocfs2_file_write_iter */
        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
                rw_level = -1;
                have_alloc_sem = 0;
@@@ -2705,14 -2615,14 +2615,14 @@@ const struct inode_operations ocfs2_spe
   */
  const struct file_operations ocfs2_fops = {
        .llseek         = ocfs2_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
        .mmap           = ocfs2_mmap,
        .fsync          = ocfs2_sync_file,
        .release        = ocfs2_file_release,
        .open           = ocfs2_file_open,
-       .aio_read       = ocfs2_file_aio_read,
-       .aio_write      = ocfs2_file_aio_write,
+       .read_iter      = ocfs2_file_read_iter,
+       .write_iter     = ocfs2_file_write_iter,
        .unlocked_ioctl = ocfs2_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
        .lock           = ocfs2_lock,
        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
-       .splice_write   = ocfs2_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .fallocate      = ocfs2_fallocate,
  };
  
@@@ -2753,21 -2663,21 +2663,21 @@@ const struct file_operations ocfs2_dop
   */
  const struct file_operations ocfs2_fops_no_plocks = {
        .llseek         = ocfs2_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
        .mmap           = ocfs2_mmap,
        .fsync          = ocfs2_sync_file,
        .release        = ocfs2_file_release,
        .open           = ocfs2_file_open,
-       .aio_read       = ocfs2_file_aio_read,
-       .aio_write      = ocfs2_file_aio_write,
+       .read_iter      = ocfs2_file_read_iter,
+       .write_iter     = ocfs2_file_write_iter,
        .unlocked_ioctl = ocfs2_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
  #endif
        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
-       .splice_write   = ocfs2_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .fallocate      = ocfs2_fallocate,
  };
  
diff --combined fs/reiserfs/file.c
index 5f6c32c668b68816584f19c982c4b9a22ded751b,f070cc827456b68cc3dd3a79bab4009a2c5a2924..db9e80ba53a0db5abe4910fa128bab1e6a2ee6ad
  #include <linux/quotaops.h>
  
  /*
 -** We pack the tails of files on file close, not at the time they are written.
 -** This implies an unnecessary copy of the tail and an unnecessary indirect item
 -** insertion/balancing, for files that are written in one write.
 -** It avoids unnecessary tail packings (balances) for files that are written in
 -** multiple writes and are small enough to have tails.
 -**
 -** file_release is called by the VFS layer when the file is closed.  If
 -** this is the last open file descriptor, and the file
 -** small enough to have a tail, and the tail is currently in an
 -** unformatted node, the tail is converted back into a direct item.
 -**
 -** We use reiserfs_truncate_file to pack the tail, since it already has
 -** all the conditions coded.
 -*/
 + * We pack the tails of files on file close, not at the time they are written.
 + * This implies an unnecessary copy of the tail and an unnecessary indirect item
 + * insertion/balancing, for files that are written in one write.
 + * It avoids unnecessary tail packings (balances) for files that are written in
 + * multiple writes and are small enough to have tails.
 + *
 + * file_release is called by the VFS layer when the file is closed.  If
 + * this is the last open file descriptor, and the file is
 + * small enough to have a tail, and the tail is currently in an
 + * unformatted node, the tail is converted back into a direct item.
 + *
 + * We use reiserfs_truncate_file to pack the tail, since it already has
 + * all the conditions coded.
 + */
  static int reiserfs_file_release(struct inode *inode, struct file *filp)
  {
  
          if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
                return 0;
  
 -      mutex_lock(&(REISERFS_I(inode)->tailpack));
 +      mutex_lock(&REISERFS_I(inode)->tailpack);
  
          if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
 -              mutex_unlock(&(REISERFS_I(inode)->tailpack));
 +              mutex_unlock(&REISERFS_I(inode)->tailpack);
                return 0;
        }
  
        if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
             !tail_has_to_be_packed(inode)) &&
            REISERFS_I(inode)->i_prealloc_count <= 0) {
 -              mutex_unlock(&(REISERFS_I(inode)->tailpack));
 +              mutex_unlock(&REISERFS_I(inode)->tailpack);
                return 0;
        }
  
        reiserfs_write_lock(inode->i_sb);
 -      /* freeing preallocation only involves relogging blocks that
 +      /*
 +       * freeing preallocation only involves relogging blocks that
         * are already in the current transaction.  preallocation gets
         * freed at the end of each transaction, so it is impossible for
         * us to log any additional blocks (including quota blocks)
         */
        err = journal_begin(&th, inode->i_sb, 1);
        if (err) {
 -              /* uh oh, we can't allow the inode to go away while there
 +              /*
 +               * uh oh, we can't allow the inode to go away while there
                 * are still preallocation blocks pending.  Try to join the
                 * aborted transaction
                 */
                jbegin_failure = err;
 -              err = journal_join_abort(&th, inode->i_sb, 1);
 +              err = journal_join_abort(&th, inode->i_sb);
  
                if (err) {
 -                      /* hmpf, our choices here aren't good.  We can pin the inode
 -                       * which will disallow unmount from every happening, we can
 -                       * do nothing, which will corrupt random memory on unmount,
 -                       * or we can forcibly remove the file from the preallocation
 -                       * list, which will leak blocks on disk.  Lets pin the inode
 +                      /*
 +                       * hmpf, our choices here aren't good.  We can pin
 +                       * the inode which will disallow unmount from ever
 +                       * happening, we can do nothing, which will corrupt
 +                       * random memory on unmount, or we can forcibly
 +                       * remove the file from the preallocation list, which
 +                       * will leak blocks on disk.  Let's pin the inode
                         * and let the admin know what is going on.
                         */
                        igrab(inode);
@@@ -96,7 -92,7 +96,7 @@@
  #ifdef REISERFS_PREALLOCATE
        reiserfs_discard_prealloc(&th, inode);
  #endif
 -      err = journal_end(&th, inode->i_sb, 1);
 +      err = journal_end(&th);
  
        /* copy back the error code from journal_begin */
        if (!err)
            (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
            tail_has_to_be_packed(inode)) {
  
 -              /* if regular file is released by last holder and it has been
 -                 appended (we append by unformatted node only) or its direct
 -                 item(s) had to be converted, then it may have to be
 -                 indirect2direct converted */
 +              /*
 +               * if regular file is released by last holder and it has been
 +               * appended (we append by unformatted node only) or its direct
 +               * item(s) had to be converted, then it may have to be
 +               * indirect2direct converted
 +               */
                err = reiserfs_truncate_file(inode, 0);
        }
 -      out:
 +out:
        reiserfs_write_unlock(inode->i_sb);
 -      mutex_unlock(&(REISERFS_I(inode)->tailpack));
 +      mutex_unlock(&REISERFS_I(inode)->tailpack);
        return err;
  }
  
  static int reiserfs_file_open(struct inode *inode, struct file *file)
  {
        int err = dquot_file_open(inode, file);
 +
 +      /* somebody might be tailpacking on final close; wait for it */
          if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
 -              /* somebody might be tailpacking on final close; wait for it */
 -              mutex_lock(&(REISERFS_I(inode)->tailpack));
 +              mutex_lock(&REISERFS_I(inode)->tailpack);
                atomic_inc(&REISERFS_I(inode)->openers);
 -              mutex_unlock(&(REISERFS_I(inode)->tailpack));
 +              mutex_unlock(&REISERFS_I(inode)->tailpack);
        }
        return err;
  }
  
  void reiserfs_vfs_truncate_file(struct inode *inode)
  {
 -      mutex_lock(&(REISERFS_I(inode)->tailpack));
 +      mutex_lock(&REISERFS_I(inode)->tailpack);
        reiserfs_truncate_file(inode, 1);
 -      mutex_unlock(&(REISERFS_I(inode)->tailpack));
 +      mutex_unlock(&REISERFS_I(inode)->tailpack);
  }
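
The openers/tailpack pairing above is worth spelling out, since the cleanup makes it easier to read: openers is an atomic count, and only the transitions that can race with a final-close tail pack take the mutex. A sketch of the handshake, using exactly the fields shown in these hunks:

	/* open: the common case is a second opener, no mutex needed */
	if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
		/* 0 -> 1: a final-close tail pack may be in flight; wait for it */
		mutex_lock(&REISERFS_I(inode)->tailpack);
		atomic_inc(&REISERFS_I(inode)->openers);
		mutex_unlock(&REISERFS_I(inode)->tailpack);
	}

	/* release: drop straight through unless we might be the last opener */
	if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
		return 0;			/* count was > 1 */

	mutex_lock(&REISERFS_I(inode)->tailpack);
	if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
		mutex_unlock(&REISERFS_I(inode)->tailpack);
		return 0;			/* raced with a new opener */
	}
	/* ... last opener: pack the tail under the mutex, then unlock ... */
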
  
  /* Sync a reiserfs file. */
@@@ -212,11 -205,10 +212,11 @@@ int reiserfs_commit_page(struct inode *
                        set_buffer_uptodate(bh);
                        if (logit) {
                                reiserfs_prepare_for_journal(s, bh, 1);
 -                              journal_mark_dirty(&th, s, bh);
 +                              journal_mark_dirty(&th, bh);
                        } else if (!buffer_dirty(bh)) {
                                mark_buffer_dirty(bh);
 -                              /* do data=ordered on any page past the end
 +                              /*
 +                               * do data=ordered on any page past the end
                                 * of file and any buffer marked BH_New.
                                 */
                                if (reiserfs_data_ordered(inode->i_sb) &&
                }
        }
        if (logit) {
 -              ret = journal_end(&th, s, bh_per_page + 1);
 -            drop_write_lock:
 +              ret = journal_end(&th);
 +drop_write_lock:
                reiserfs_write_unlock(s);
        }
        /*
  }
  
  const struct file_operations reiserfs_file_operations = {
-       .read = do_sync_read,
-       .write = do_sync_write,
+       .read = new_sync_read,
+       .write = new_sync_write,
        .unlocked_ioctl = reiserfs_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl = reiserfs_compat_ioctl,
        .open = reiserfs_file_open,
        .release = reiserfs_file_release,
        .fsync = reiserfs_sync_file,
-       .aio_read = generic_file_aio_read,
-       .aio_write = generic_file_aio_write,
+       .read_iter = generic_file_read_iter,
+       .write_iter = generic_file_write_iter,
        .splice_read = generic_file_splice_read,
-       .splice_write = generic_file_splice_write,
+       .splice_write = iter_file_splice_write,
        .llseek = generic_file_llseek,
  };
  
diff --combined fs/reiserfs/inode.c
index e3ca04894919c4d0a38f2623676d7ffe1ce6aff3,b8003e8dd1f47bf726d78a1f1a40aba7a56ecc30..63b2b0ec49e6afacd955abf9f172751768ee08ee
@@@ -25,10 -25,7 +25,10 @@@ int reiserfs_commit_write(struct file *
  
  void reiserfs_evict_inode(struct inode *inode)
  {
 -      /* We need blocks for transaction + (user+group) quota update (possibly delete) */
 +      /*
 +       * We need blocks for transaction + (user+group) quota
 +       * update (possibly delete)
 +       */
        int jbegin_count =
            JOURNAL_PER_BALANCE_CNT * 2 +
            2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
        if (inode->i_nlink)
                goto no_delete;
  
 -      /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
 -      if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {  /* also handles bad_inode case */
 +      /*
 +       * The = 0 happens when we abort creating a new inode
 +       * for some reason like lack of space.
 +       * also handles bad_inode case
 +       */
 +      if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {
  
                reiserfs_delete_xattrs(inode);
  
  
                err = reiserfs_delete_object(&th, inode);
  
 -              /* Do quota update inside a transaction for journaled quotas. We must do that
 -               * after delete_object so that quota updates go into the same transaction as
 -               * stat data deletion */
 +              /*
 +               * Do quota update inside a transaction for journaled quotas.
 +               * We must do that after delete_object so that quota updates
 +               * go into the same transaction as stat data deletion
 +               */
                if (!err) {
                        int depth = reiserfs_write_unlock_nested(inode->i_sb);
                        dquot_free_inode(inode);
                        reiserfs_write_lock_nested(inode->i_sb, depth);
                }
  
 -              if (journal_end(&th, inode->i_sb, jbegin_count))
 +              if (journal_end(&th))
                        goto out;
  
 -              /* check return value from reiserfs_delete_object after
 +              /*
 +               * check return value from reiserfs_delete_object after
                 * ending the transaction
                 */
                if (err)
                    goto out;
  
 -              /* all items of file are deleted, so we can remove "save" link */
 -              remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything
 -                                                               * about an error here */
 +              /*
 +               * all items of file are deleted, so we can remove
 +               * "save" link
 +               * we can't do anything about an error here
 +               */
 +              remove_save_link(inode, 0 /* not truncate */);
  out:
                reiserfs_write_unlock(inode->i_sb);
        } else {
                /* no object items are in the tree */
                ;
        }
 -      clear_inode(inode);     /* note this must go after the journal_end to prevent deadlock */
 +
 +      /* note this must go after the journal_end to prevent deadlock */
 +      clear_inode(inode);
 +
        dquot_drop(inode);
        inode->i_blocks = 0;
        return;
@@@ -119,10 -103,8 +119,10 @@@ static void _make_cpu_key(struct cpu_ke
        key->key_length = length;
  }
  
 -/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
 -   offset and type of key */
 +/*
 + * take base of inode_key (it comes from inode always) (dirid, objectid)
 + * and version from an inode, set offset and type of key
 + */
  void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
                  int type, int length)
  {
                      length);
  }
  
 -//
 -// when key is 0, do not set version and short key
 -//
 +/* when key is 0, do not set version and short key */
  inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
                              int version,
                              loff_t offset, int type, int length,
        set_le_ih_k_type(ih, type);
        put_ih_item_len(ih, length);
        /*    set_ih_free_space (ih, 0); */
 -      // for directory items it is entry count, for directs and stat
 -      // datas - 0xffff, for indirects - 0
 +      /*
 +       * for directory items it is entry count, for directs and stat
 +       * datas - 0xffff, for indirects - 0
 +       */
        put_ih_entry_count(ih, entry_count);
  }
  
 -//
 -// FIXME: we might cache recently accessed indirect item
 -
 -// Ugh.  Not too eager for that....
 -//  I cut the code until such time as I see a convincing argument (benchmark).
 -// I don't want a bloated inode struct..., and I don't like code complexity....
 -
 -/* cutting the code is fine, since it really isn't in use yet and is easy
 -** to add back in.  But, Vladimir has a really good idea here.  Think
 -** about what happens for reading a file.  For each page,
 -** The VFS layer calls reiserfs_readpage, who searches the tree to find
 -** an indirect item.  This indirect item has X number of pointers, where
 -** X is a big number if we've done the block allocation right.  But,
 -** we only use one or two of these pointers during each call to readpage,
 -** needlessly researching again later on.
 -**
 -** The size of the cache could be dynamic based on the size of the file.
 -**
 -** I'd also like to see us cache the location the stat data item, since
 -** we are needlessly researching for that frequently.
 -**
 -** --chris
 -*/
 +/*
 + * FIXME: we might cache recently accessed indirect item
 + * Ugh.  Not too eager for that....
 + * I cut the code until such time as I see a convincing argument (benchmark).
 + * I don't want a bloated inode struct..., and I don't like code complexity....
 + */
  
 -/* If this page has a file tail in it, and
 -** it was read in by get_block_create_0, the page data is valid,
 -** but tail is still sitting in a direct item, and we can't write to
 -** it.  So, look through this page, and check all the mapped buffers
 -** to make sure they have valid block numbers.  Any that don't need
 -** to be unmapped, so that __block_write_begin will correctly call
 -** reiserfs_get_block to convert the tail into an unformatted node
 -*/
 +/*
 + * cutting the code is fine, since it really isn't in use yet and is easy
 + * to add back in.  But, Vladimir has a really good idea here.  Think
 + * about what happens for reading a file.  For each page,
 + * The VFS layer calls reiserfs_readpage, who searches the tree to find
 + * an indirect item.  This indirect item has X number of pointers, where
 + * X is a big number if we've done the block allocation right.  But,
 + * we only use one or two of these pointers during each call to readpage,
 + * needlessly researching again later on.
 + *
 + * The size of the cache could be dynamic based on the size of the file.
 + *
 + * I'd also like to see us cache the location the stat data item, since
 + * we are needlessly researching for that frequently.
 + *
 + * --chris
 + */
 +
 +/*
 + * If this page has a file tail in it, and
 + * it was read in by get_block_create_0, the page data is valid,
 + * but tail is still sitting in a direct item, and we can't write to
 + * it.  So, look through this page, and check all the mapped buffers
 + * to make sure they have valid block numbers.  Any that don't need
 + * to be unmapped, so that __block_write_begin will correctly call
 + * reiserfs_get_block to convert the tail into an unformatted node
 + */
  static inline void fix_tail_page_for_writing(struct page *page)
  {
        struct buffer_head *head, *next, *bh;
        }
  }
  
 -/* reiserfs_get_block does not need to allocate a block only if it has been
 -   done already or non-hole position has been found in the indirect item */
 +/*
 + * reiserfs_get_block does not need to allocate a block only if it has been
 + * done already or non-hole position has been found in the indirect item
 + */
  static inline int allocation_needed(int retval, b_blocknr_t allocated,
                                    struct item_head *ih,
                                    __le32 * item, int pos_in_item)
@@@ -233,16 -211,14 +233,16 @@@ static inline void set_block_dev_mapped
        map_bh(bh, inode->i_sb, block);
  }
  
 -//
 -// files which were created in the earlier version can not be longer,
 -// than 2 gb
 -//
 +/*
 + * files which were created in the earlier version can not be longer,
 + * than 2 gb
 + */
  static int file_capable(struct inode *inode, sector_t block)
  {
 -      if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||      // it is new file.
 -          block < (1 << (31 - inode->i_sb->s_blocksize_bits)))        // old file, but 'block' is inside of 2gb
 +      /* it is new file. */
 +      if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||
 +          /* old file, but 'block' is inside of 2gb */
 +          block < (1 << (31 - inode->i_sb->s_blocksize_bits)))
                return 1;
  
        return 0;
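
Worked example of that old-format limit: with 4 KiB blocks, s_blocksize_bits is 12, so the cutoff is 1 << (31 - 12) = 524288 blocks, and 524288 * 4096 bytes = 2^31 bytes = 2 GiB; any 'block' index below the cutoff is still addressable by a 3.5-format key.
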
@@@ -252,6 -228,7 +252,6 @@@ static int restart_transaction(struct r
                               struct inode *inode, struct treepath *path)
  {
        struct super_block *s = th->t_super;
 -      int len = th->t_blocks_allocated;
        int err;
  
        BUG_ON(!th->t_trans_id);
                return 0;
        }
        reiserfs_update_sd(th, inode);
 -      err = journal_end(th, s, len);
 +      err = journal_end(th);
        if (!err) {
                err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
                if (!err)
        return err;
  }
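
restart_transaction() also shows the simplified journalling convention from this series: journal_end() now takes only the handle (hence the deleted t_blocks_allocated read), and journal_mark_dirty() drops its superblock argument. A minimal sketch of the new begin/dirty/end sequence, using only calls that appear in these hunks (jbegin_count and bh stand for whatever the caller reserved and dirtied):

	struct reiserfs_transaction_handle th;
	int err;

	err = journal_begin(&th, inode->i_sb, jbegin_count);
	if (err)
		return err;

	reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
	journal_mark_dirty(&th, bh);		/* was: (&th, inode->i_sb, bh) */
	reiserfs_update_sd(&th, inode);

	err = journal_end(&th);			/* was: (&th, inode->i_sb, count) */
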
  
 -// it is called by get_block when create == 0. Returns block number
 -// for 'block'-th logical block of file. When it hits direct item it
 -// returns 0 (being called from bmap) or read direct item into piece
 -// of page (bh_result)
 -
 -// Please improve the english/clarity in the comment above, as it is
 -// hard to understand.
 -
 +/*
 + * it is called by get_block when create == 0. Returns block number
 + * for 'block'-th logical block of file. When it hits direct item it
 + * returns 0 (being called from bmap) or read direct item into piece
 + * of page (bh_result)
 + * Please improve the english/clarity in the comment above, as it is
 + * hard to understand.
 + */
  static int _get_block_create_0(struct inode *inode, sector_t block,
                               struct buffer_head *bh_result, int args)
  {
        int done = 0;
        unsigned long offset;
  
 -      // prepare the key to look for the 'block'-th block of file
 +      /* prepare the key to look for the 'block'-th block of file */
        make_cpu_key(&key, inode,
                     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
                     3);
                        kunmap(bh_result->b_page);
                if (result == IO_ERROR)
                        return -EIO;
 -              // We do not return -ENOENT if there is a hole but page is uptodate, because it means
 -              // That there is some MMAPED data associated with it that is yet to be written to disk.
 +              /*
 +               * We do not return -ENOENT if there is a hole but page is
 +               * uptodate, because it means that there is some MMAPED data
 +               * associated with it that is yet to be written to disk.
 +               */
                if ((args & GET_BLOCK_NO_HOLE)
                    && !PageUptodate(bh_result->b_page)) {
                        return -ENOENT;
                }
                return 0;
        }
 -      //
 +
        bh = get_last_bh(&path);
 -      ih = get_ih(&path);
 +      ih = tp_item_head(&path);
        if (is_indirect_le_ih(ih)) {
 -              __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);
 +              __le32 *ind_item = (__le32 *) ih_item_body(bh, ih);
  
 -              /* FIXME: here we could cache indirect item or part of it in
 -                 the inode to avoid search_by_key in case of subsequent
 -                 access to file */
 +              /*
 +               * FIXME: here we could cache indirect item or part of it in
 +               * the inode to avoid search_by_key in case of subsequent
 +               * access to file
 +               */
                blocknr = get_block_num(ind_item, path.pos_in_item);
                ret = 0;
                if (blocknr) {
                                set_buffer_boundary(bh_result);
                        }
                } else
 -                      // We do not return -ENOENT if there is a hole but page is uptodate, because it means
 -                      // That there is some MMAPED data associated with it that is yet to  be written to disk.
 +                      /*
 +                       * We do not return -ENOENT if there is a hole but
 +                       * page is uptodate, because it means that there is
 +                       * some MMAPED data associated with it that is
 +                       * yet to be written to disk.
 +                       */
                if ((args & GET_BLOCK_NO_HOLE)
                            && !PageUptodate(bh_result->b_page)) {
                        ret = -ENOENT;
                        kunmap(bh_result->b_page);
                return ret;
        }
 -      // requested data are in direct item(s)
 +      /* requested data are in direct item(s) */
        if (!(args & GET_BLOCK_READ_DIRECT)) {
 -              // we are called by bmap. FIXME: we can not map block of file
 -              // when it is stored in direct item(s)
 +              /*
 +               * we are called by bmap. FIXME: we can not map block of file
 +               * when it is stored in direct item(s)
 +               */
                pathrelse(&path);
                if (p)
                        kunmap(bh_result->b_page);
                return -ENOENT;
        }
  
 -      /* if we've got a direct item, and the buffer or page was uptodate,
 -       ** we don't want to pull data off disk again.  skip to the
 -       ** end, where we map the buffer and return
 +      /*
 +       * if we've got a direct item, and the buffer or page was uptodate,
 +       * we don't want to pull data off disk again.  skip to the
 +       * end, where we map the buffer and return
         */
        if (buffer_uptodate(bh_result)) {
                goto finished;
        } else
                /*
 -               ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
 -               ** pages without any buffers.  If the page is up to date, we don't want
 -               ** read old data off disk.  Set the up to date bit on the buffer instead
 -               ** and jump to the end
 +               * grab_tail_page can trigger calls to reiserfs_get_block on
 +               * up to date pages without any buffers.  If the page is up
 +               * to date, we don't want read old data off disk.  Set the up
 +               * to date bit on the buffer instead and jump to the end
                 */
        if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
                set_buffer_uptodate(bh_result);
                goto finished;
        }
 -      // read file tail into part of page
 +      /* read file tail into part of page */
        offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
        copy_item_head(&tmp_ih, ih);
  
 -      /* we only want to kmap if we are reading the tail into the page.
 -       ** this is not the common case, so we don't kmap until we are
 -       ** sure we need to.  But, this means the item might move if
 -       ** kmap schedules
 +      /*
 +       * we only want to kmap if we are reading the tail into the page.
 +       * this is not the common case, so we don't kmap until we are
 +       * sure we need to.  But, this means the item might move if
 +       * kmap schedules
         */
        if (!p)
                p = (char *)kmap(bh_result->b_page);
                if (!is_direct_le_ih(ih)) {
                        BUG();
                }
 -              /* make sure we don't read more bytes than actually exist in
 -               ** the file.  This can happen in odd cases where i_size isn't
 -               ** correct, and when direct item padding results in a few
 -               ** extra bytes at the end of the direct item
 +              /*
 +               * make sure we don't read more bytes than actually exist in
 +               * the file.  This can happen in odd cases where i_size isn't
 +               * correct, and when direct item padding results in a few
 +               * extra bytes at the end of the direct item
                 */
                if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
                        break;
                } else {
                        chars = ih_item_len(ih) - path.pos_in_item;
                }
 -              memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);
 +              memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars);
  
                if (done)
                        break;
  
                p += chars;
  
 +              /*
 +               * we are done if the direct item read is not the last item of
 +               * the node.  FIXME: we could try to check the right delimiting key
 +               * to see whether direct item continues in the right
 +               * neighbor or rely on i_size
 +               */
                if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
 -                      // we done, if read direct item is not the last item of
 -                      // node FIXME: we could try to check right delimiting key
 -                      // to see whether direct item continues in the right
 -                      // neighbor or rely on i_size
                        break;
  
 -              // update key to look for the next piece
 +              /* update key to look for the next piece */
                set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
                result = search_for_position_by_key(inode->i_sb, &key, &path);
                if (result != POSITION_FOUND)
 -                      // i/o error most likely
 +                      /* i/o error most likely */
                        break;
                bh = get_last_bh(&path);
 -              ih = get_ih(&path);
 +              ih = tp_item_head(&path);
        } while (1);
  
        flush_dcache_page(bh_result->b_page);
        kunmap(bh_result->b_page);
  
 -      finished:
 +finished:
        pathrelse(&path);
  
        if (result == IO_ERROR)
                return -EIO;
  
 -      /* this buffer has valid data, but isn't valid for io.  mapping it to
 +      /*
 +       * this buffer has valid data, but isn't valid for io.  mapping it to
         * block #0 tells the rest of reiserfs it just has a tail in it
         */
        map_bh(bh_result, inode->i_sb, 0);
        return 0;
  }
  
 -// this is called to create file map. So, _get_block_create_0 will not
 -// read direct item
 +/*
 + * this is called to create file map. So, _get_block_create_0 will not
 + * read direct item
 + */
  static int reiserfs_bmap(struct inode *inode, sector_t block,
                         struct buffer_head *bh_result, int create)
  {
        return 0;
  }
  
 -/* special version of get_block that is only used by grab_tail_page right
 -** now.  It is sent to __block_write_begin, and when you try to get a
 -** block past the end of the file (or a block from a hole) it returns
 -** -ENOENT instead of a valid buffer.  __block_write_begin expects to
 -** be able to do i/o on the buffers returned, unless an error value
 -** is also returned.
 -**
 -** So, this allows __block_write_begin to be used for reading a single block
 -** in a page.  Where it does not produce a valid page for holes, or past the
 -** end of the file.  This turns out to be exactly what we need for reading
 -** tails for conversion.
 -**
 -** The point of the wrapper is forcing a certain value for create, even
 -** though the VFS layer is calling this function with create==1.  If you
 -** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
 -** don't use this function.
 +/*
 + * special version of get_block that is only used by grab_tail_page right
 + * now.  It is sent to __block_write_begin, and when you try to get a
 + * block past the end of the file (or a block from a hole) it returns
 + * -ENOENT instead of a valid buffer.  __block_write_begin expects to
 + * be able to do i/o on the buffers returned, unless an error value
 + * is also returned.
 + *
 + * So, this allows __block_write_begin to be used for reading a single block
 + * in a page.  Where it does not produce a valid page for holes, or past the
 + * end of the file.  This turns out to be exactly what we need for reading
 + * tails for conversion.
 + *
 + * The point of the wrapper is forcing a certain value for create, even
 + * though the VFS layer is calling this function with create==1.  If you
 + * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
 + * don't use this function.
  */
  static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
                                       struct buffer_head *bh_result,
        return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
  }
  
 -/* This is special helper for reiserfs_get_block in case we are executing
 -   direct_IO request. */
 +/*
 + * This is special helper for reiserfs_get_block in case we are executing
 + * direct_IO request.
 + */
  static int reiserfs_get_blocks_direct_io(struct inode *inode,
                                         sector_t iblock,
                                         struct buffer_head *bh_result,
  
        bh_result->b_page = NULL;
  
 -      /* We set the b_size before reiserfs_get_block call since it is
 -         referenced in convert_tail_for_hole() that may be called from
 -         reiserfs_get_block() */
 +      /*
 +       * We set the b_size before reiserfs_get_block call since it is
 +       * referenced in convert_tail_for_hole() that may be called from
 +       * reiserfs_get_block()
 +       */
        bh_result->b_size = (1 << inode->i_blkbits);
  
        ret = reiserfs_get_block(inode, iblock, bh_result,
  
        /* don't allow direct io onto tail pages */
        if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
 -              /* make sure future calls to the direct io funcs for this offset
 -               ** in the file fail by unmapping the buffer
 +              /*
 +               * make sure future calls to the direct io funcs for this
 +               * offset in the file fail by unmapping the buffer
                 */
                clear_buffer_mapped(bh_result);
                ret = -EINVAL;
        }
 -      /* Possible unpacked tail. Flush the data before pages have
 -         disappeared */
 +
 +      /*
 +       * Possible unpacked tail. Flush the data before pages have
 +       * disappeared
 +       */
        if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
                int err;
  
                if (err < 0)
                        ret = err;
        }
 -      out:
 +out:
        return ret;
  }
  
  /*
 -** helper function for when reiserfs_get_block is called for a hole
 -** but the file tail is still in a direct item
 -** bh_result is the buffer head for the hole
 -** tail_offset is the offset of the start of the tail in the file
 -**
 -** This calls prepare_write, which will start a new transaction
 -** you should not be in a transaction, or have any paths held when you
 -** call this.
 -*/
 + * helper function for when reiserfs_get_block is called for a hole
 + * but the file tail is still in a direct item
 + * bh_result is the buffer head for the hole
 + * tail_offset is the offset of the start of the tail in the file
 + *
 + * This calls prepare_write, which will start a new transaction
 + * you should not be in a transaction, or have any paths held when you
 + * call this.
 + */
  static int convert_tail_for_hole(struct inode *inode,
                                 struct buffer_head *bh_result,
                                 loff_t tail_offset)
        tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
  
        index = tail_offset >> PAGE_CACHE_SHIFT;
 -      /* hole_page can be zero in case of direct_io, we are sure
 -         that we cannot get here if we write with O_DIRECT into
 -         tail page */
 +      /*
 +       * hole_page can be zero in case of direct_io, we are sure
 +       * that we cannot get here if we write with O_DIRECT into tail page
 +       */
        if (!hole_page || index != hole_page->index) {
                tail_page = grab_cache_page(inode->i_mapping, index);
                retval = -ENOMEM;
                tail_page = hole_page;
        }
  
 -      /* we don't have to make sure the conversion did not happen while
 -       ** we were locking the page because anyone that could convert
 -       ** must first take i_mutex.
 -       **
 -       ** We must fix the tail page for writing because it might have buffers
 -       ** that are mapped, but have a block number of 0.  This indicates tail
 -       ** data that has been read directly into the page, and
 -       ** __block_write_begin won't trigger a get_block in this case.
 +      /*
 +       * we don't have to make sure the conversion did not happen while
 +       * we were locking the page because anyone that could convert
 +       * must first take i_mutex.
 +       *
 +       * We must fix the tail page for writing because it might have buffers
 +       * that are mapped, but have a block number of 0.  This indicates tail
 +       * data that has been read directly into the page, and
 +       * __block_write_begin won't trigger a get_block in this case.
         */
        fix_tail_page_for_writing(tail_page);
        retval = __reiserfs_write_begin(tail_page, tail_start,
  
        retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
  
 -      unlock:
 +unlock:
        if (tail_page != hole_page) {
                unlock_page(tail_page);
                page_cache_release(tail_page);
        }
 -      out:
 +out:
        return retval;
  }
  
@@@ -657,8 -604,7 +657,8 @@@ int reiserfs_get_block(struct inode *in
                       struct buffer_head *bh_result, int create)
  {
        int repeat, retval = 0;
 -      b_blocknr_t allocated_block_nr = 0;     // b_blocknr_t is (unsigned) 32 bit int
 +      /* b_blocknr_t is an (unsigned) 32-bit int */
 +      b_blocknr_t allocated_block_nr = 0;
        INITIALIZE_PATH(path);
        int pos_in_item;
        struct cpu_key key;
        int done;
        int fs_gen;
        struct reiserfs_transaction_handle *th = NULL;
 -      /* space reserved in transaction batch:
 -         . 3 balancings in direct->indirect conversion
 -         . 1 block involved into reiserfs_update_sd()
 -         XXX in practically impossible worst case direct2indirect()
 -         can incur (much) more than 3 balancings.
 -         quota update for user, group */
 +      /*
 +       * space reserved in transaction batch:
 +       * . 3 balancings in direct->indirect conversion
 +       * . 1 block involved into reiserfs_update_sd()
 +       * XXX in practically impossible worst case direct2indirect()
 +       * can incur (much) more than 3 balancings.
 +       * quota update for user, group
 +       */
        int jbegin_count =
            JOURNAL_PER_BALANCE_CNT * 3 + 1 +
            2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
                return -EFBIG;
        }
  
 -      /* if !create, we aren't changing the FS, so we don't need to
 -       ** log anything, so we don't need to start a transaction
 +      /*
 +       * if !create, we aren't changing the FS, so we don't need to
 +       * log anything, so we don't need to start a transaction
         */
        if (!(create & GET_BLOCK_CREATE)) {
                int ret;
                reiserfs_write_unlock(inode->i_sb);
                return ret;
        }
 +
        /*
         * if we're already in a transaction, make sure to close
         * any new transactions we start in this func
            reiserfs_transaction_running(inode->i_sb))
                dangle = 0;
  
 -      /* If file is of such a size, that it might have a tail and tails are enabled
 -       ** we should mark it as possibly needing tail packing on close
 +      /*
 +       * If the file is of such a size that it might have a tail and
 +       * tails are enabled, we should mark it as possibly needing
 +       * tail packing on close
         */
        if ((have_large_tails(inode->i_sb)
             && inode->i_size < i_block_size(inode) * 4)
        /* set the key of the first byte in the 'block'-th block of file */
        make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
        if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
 -            start_trans:
 +start_trans:
                th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
                if (!th) {
                        retval = -ENOMEM;
                }
                reiserfs_update_inode_transaction(inode);
        }
 -      research:
 +research:
  
        retval = search_for_position_by_key(inode->i_sb, &key, &path);
        if (retval == IO_ERROR) {
        }
  
        bh = get_last_bh(&path);
 -      ih = get_ih(&path);
 -      item = get_item(&path);
 +      ih = tp_item_head(&path);
 +      item = tp_item_body(&path);
        pos_in_item = path.pos_in_item;
  
        fs_gen = get_generation(inode->i_sb);
                    _allocate_block(th, block, inode, &allocated_block_nr,
                                    &path, create);
  
 +              /*
 +               * restart the transaction to give the journal a chance to free
 +               * some blocks.  releases the path, so we have to go back to
 +               * research if we succeed on the second try
 +               */
                if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
 -                      /* restart the transaction to give the journal a chance to free
 -                       ** some blocks.  releases the path, so we have to go back to
 -                       ** research if we succeed on the second try
 -                       */
                        SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
                        retval = restart_transaction(th, inode, &path);
                        if (retval)
  
        if (indirect_item_found(retval, ih)) {
                b_blocknr_t unfm_ptr;
 -              /* 'block'-th block is in the file already (there is
 -                 corresponding cell in some indirect item). But it may be
 -                 zero unformatted node pointer (hole) */
 +              /*
 +               * 'block'-th block is in the file already (there is
 +               * corresponding cell in some indirect item). But it may be
 +               * zero unformatted node pointer (hole)
 +               */
                unfm_ptr = get_block_num(item, pos_in_item);
                if (unfm_ptr == 0) {
                        /* use allocated block to plug the hole */
                                reiserfs_add_ordered_list(inode, bh_result);
                        put_block_num(item, pos_in_item, allocated_block_nr);
                        unfm_ptr = allocated_block_nr;
 -                      journal_mark_dirty(th, inode->i_sb, bh);
 +                      journal_mark_dirty(th, bh);
                        reiserfs_update_sd(th, inode);
                }
                set_block_dev_mapped(bh_result, unfm_ptr, inode);
  
                reiserfs_write_unlock(inode->i_sb);
  
 -              /* the item was found, so new blocks were not added to the file
 -               ** there is no need to make sure the inode is updated with this
 -               ** transaction
 +              /*
 +               * the item was found, so new blocks were not added to the file
 +               * there is no need to make sure the inode is updated with this
 +               * transaction
                 */
                return retval;
        }
                goto start_trans;
        }
  
 -      /* desired position is not found or is in the direct item. We have
 -         to append file with holes up to 'block'-th block converting
 -         direct items to indirect one if necessary */
 +      /*
 +       * desired position is not found or is in the direct item. We have
 +       * to append file with holes up to 'block'-th block converting
 +       * direct items to indirect one if necessary
 +       */
        done = 0;
        do {
                if (is_statdata_le_ih(ih)) {
                                          TYPE_INDIRECT, UNFM_P_SIZE,
                                          0 /* free_space */ );
  
 +                      /*
 +                       * we are going to add 'block'-th block to the file.
 +                       * Use allocated block for that
 +                       */
                        if (cpu_key_k_offset(&key) == 1) {
 -                              /* we are going to add 'block'-th block to the file. Use
 -                                 allocated block for that */
                                unp = cpu_to_le32(allocated_block_nr);
                                set_block_dev_mapped(bh_result,
                                                     allocated_block_nr, inode);
                                set_buffer_new(bh_result);
                                done = 1;
                        }
 -                      tmp_key = key;  // ;)
 +                      tmp_key = key;  /* ;) */
                        set_cpu_key_k_offset(&tmp_key, 1);
                        PATH_LAST_POSITION(&path)++;
  
                        if (retval) {
                                reiserfs_free_block(th, inode,
                                                    allocated_block_nr, 1);
 -                              goto failure;   // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
 +                              /*
 +                               * retval == -ENOSPC, -EDQUOT or -EIO
 +                               * or -EEXIST
 +                               */
 +                              goto failure;
                        }
 -                      //mark_tail_converted (inode);
                } else if (is_direct_le_ih(ih)) {
                        /* direct item has to be converted */
                        loff_t tail_offset;
                        tail_offset =
                            ((le_ih_k_offset(ih) -
                              1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
 +
 +                      /*
 +                       * direct item we just found fits into block we have
 +                       * to map. Convert it into unformatted node: use
 +                       * bh_result for the conversion
 +                       */
                        if (tail_offset == cpu_key_k_offset(&key)) {
 -                              /* direct item we just found fits into block we have
 -                                 to map. Convert it into unformatted node: use
 -                                 bh_result for the conversion */
                                set_block_dev_mapped(bh_result,
                                                     allocated_block_nr, inode);
                                unbh = bh_result;
                                done = 1;
                        } else {
 -                              /* we have to padd file tail stored in direct item(s)
 -                                 up to block size and convert it to unformatted
 -                                 node. FIXME: this should also get into page cache */
 +                              /*
 +                               * we have to pad file tail stored in direct
 +                               * item(s) up to block size and convert it
 +                               * to unformatted node. FIXME: this should
 +                               * also get into page cache
 +                               */
  
                                pathrelse(&path);
                                /*
                                                        inode->i_ino,
                                                        retval);
                                        if (allocated_block_nr) {
 -                                              /* the bitmap, the super, and the stat data == 3 */
 +                                              /*
 +                                               * the bitmap, the super,
 +                                               * and the stat data == 3
 +                                               */
                                                if (!th)
                                                        th = reiserfs_persistent_transaction(inode->i_sb, 3);
                                                if (th)
                                                    allocated_block_nr, 1);
                                goto failure;
                        }
 -                      /* it is important the set_buffer_uptodate is done after
 -                       ** the direct2indirect.  The buffer might contain valid
 -                       ** data newer than the data on disk (read by readpage, changed,
 -                       ** and then sent here by writepage).  direct2indirect needs
 -                       ** to know if unbh was already up to date, so it can decide
 -                       ** if the data in unbh needs to be replaced with data from
 -                       ** the disk
 +                      /*
 +                       * it is important the set_buffer_uptodate is done
 +                       * after the direct2indirect.  The buffer might
 +                       * contain valid data newer than the data on disk
 +                       * (read by readpage, changed, and then sent here by
 +                       * writepage).  direct2indirect needs to know if unbh
 +                       * was already up to date, so it can decide if the
 +                       * data in unbh needs to be replaced with data from
 +                       * the disk
                         */
                        set_buffer_uptodate(unbh);
  
 -                      /* unbh->b_page == NULL in case of DIRECT_IO request, this means
 -                         buffer will disappear shortly, so it should not be added to
 +                      /*
 +                       * unbh->b_page == NULL in case of DIRECT_IO request,
 +                       * this means buffer will disappear shortly, so it
 +                       * should not be added to the tail list
                         */
                        if (unbh->b_page) {
 -                              /* we've converted the tail, so we must
 -                               ** flush unbh before the transaction commits
 +                              /*
 +                               * we've converted the tail, so we must
 +                               * flush unbh before the transaction commits
                                 */
                                reiserfs_add_tail_list(inode, unbh);
  
 -                              /* mark it dirty now to prevent commit_write from adding
 -                               ** this buffer to the inode's dirty buffer list
 +                              /*
 +                               * mark it dirty now to prevent commit_write
 +                               * from adding this buffer to the inode's
 +                               * dirty buffer list
                                 */
                                /*
 -                               * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
 -                               * It's still atomic, but it sets the page dirty too,
 -                               * which makes it eligible for writeback at any time by the
 -                               * VM (which was also the case with __mark_buffer_dirty())
 +                               * AKPM: changed __mark_buffer_dirty to
 +                               * mark_buffer_dirty().  It's still atomic,
 +                               * but it sets the page dirty too, which makes
 +                               * it eligible for writeback at any time by the
 +                               * VM (which was also the case with
 +                               * __mark_buffer_dirty())
                                 */
                                mark_buffer_dirty(unbh);
                        }
                } else {
 -                      /* append indirect item with holes if needed, when appending
 -                         pointer to 'block'-th block use block, which is already
 -                         allocated */
 +                      /*
 +                       * append the indirect item with holes if needed; when
 +                       * appending the pointer to the 'block'-th block, use
 +                       * the block that is already allocated
 +                       */
                        struct cpu_key tmp_key;
 -                      unp_t unf_single = 0;   // We use this in case we need to allocate only
 -                      // one block which is a fastpath
 +                      /*
 +                       * We use this in case we need to allocate
 +                       * only one block, which is the fast path
 +                       */
 +                      unp_t unf_single = 0;
                        unp_t *un;
                        __u64 max_to_insert =
                            MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
  
                        RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
                               "vs-804: invalid position for append");
 -                      /* indirect item has to be appended, set up key of that position */
 +                      /*
 +                       * indirect item has to be appended,
 +                       * set up key of that position
 +                       * (key type is unimportant)
 +                       */
                        make_cpu_key(&tmp_key, inode,
                                     le_key_k_offset(version,
 -                                                   &(ih->ih_key)) +
 +                                                   &ih->ih_key) +
                                     op_bytes_number(ih,
                                                     inode->i_sb->s_blocksize),
 -                                   //pos_in_item * inode->i_sb->s_blocksize,
 -                                   TYPE_INDIRECT, 3); // key type is unimportant
 +                                   TYPE_INDIRECT, 3);
  
                        RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
                               "green-805: invalid offset");
                                }
                        }
                        if (blocks_needed <= max_to_insert) {
 -                              /* we are going to add target block to the file. Use allocated
 -                                 block for that */
 +                              /*
 +                               * we are going to add target block to
 +                               * the file. Use allocated block for that
 +                               */
                                un[blocks_needed - 1] =
                                    cpu_to_le32(allocated_block_nr);
                                set_block_dev_mapped(bh_result,
                                done = 1;
                        } else {
                                /* paste hole to the indirect item */
 -                              /* If kmalloc failed, max_to_insert becomes zero and it means we
 -                                 only have space for one block */
 +                              /*
 +                               * If kmalloc failed, max_to_insert becomes
 +                               * zero and it means we only have space for
 +                               * one block
 +                               */
                                blocks_needed =
                                    max_to_insert ? max_to_insert : 1;
                        }
                                goto failure;
                        }
                        if (!done) {
 -                              /* We need to mark new file size in case this function will be
 -                                 interrupted/aborted later on. And we may do this only for
 -                                 holes. */
 +                              /*
 +                               * We need to mark the new file size in case
 +                               * this function is interrupted/aborted
 +                               * later on, and we may do this only for
 +                               * holes.
 +                               */
                                inode->i_size +=
                                    inode->i_sb->s_blocksize * blocks_needed;
                        }
                if (done == 1)
                        break;
  
 -              /* this loop could log more blocks than we had originally asked
 -               ** for.  So, we have to allow the transaction to end if it is
 -               ** too big or too full.  Update the inode so things are
 -               ** consistent if we crash before the function returns
 -               **
 -               ** release the path so that anybody waiting on the path before
 -               ** ending their transaction will be able to continue.
 +              /*
 +               * this loop could log more blocks than we had originally
 +               * asked for.  So, we have to allow the transaction to end
 +               * if it is too big or too full.  Update the inode so things
 +               * are consistent if we crash before the function returns.
 +               *
 +               * release the path so that anybody waiting on the path before
 +               * ending their transaction will be able to continue.
                 */
                if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
                        retval = restart_transaction(th, inode, &path);
                        goto failure;
                }
                bh = get_last_bh(&path);
 -              ih = get_ih(&path);
 -              item = get_item(&path);
 +              ih = tp_item_head(&path);
 +              item = tp_item_body(&path);
                pos_in_item = path.pos_in_item;
        } while (1);
  
        retval = 0;
  
 -      failure:
 +failure:
        if (th && (!dangle || (retval && !th->t_trans_id))) {
                int err;
                if (th->t_trans_id)
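As an aside on the tail_offset computation in the hunk above: reiserfs item
offsets are 1-based, so the code shifts to 0-based, masks down to the block
boundary, and shifts back. A minimal userspace sketch of that arithmetic (the
block size and sample offsets are assumptions, not values from this diff):

#include <assert.h>
#include <stdio.h>

/*
 * Mirror of the tail_offset arithmetic in reiserfs_get_block():
 * offsets are 1-based, so shift to 0-based, round down to the
 * block boundary, then shift back to 1-based.
 */
static unsigned long tail_block_start(unsigned long offset_1based,
				      unsigned long blocksize)
{
	return ((offset_1based - 1) & ~(blocksize - 1)) + 1;
}

int main(void)
{
	unsigned long bs = 4096;	/* assumed block size */

	/* byte 1 and byte 4096 both live in the first block */
	assert(tail_block_start(1, bs) == 1);
	assert(tail_block_start(4096, bs) == 1);
	/* byte 4097 starts the second block */
	assert(tail_block_start(4097, bs) == 4097);

	printf("tail of offset 6000 starts at %lu\n",
	       tail_block_start(6000, bs));	/* prints 4097 */
	return 0;
}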
@@@ -1165,10 -1060,8 +1165,10 @@@ reiserfs_readpages(struct file *file, s
        return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
  }
  
 -/* Compute real number of used bytes by file
 - * Following three functions can go away when we'll have enough space in stat item
 +/*
 + * Compute the real number of bytes used by the file.
 + * The following three functions can go away when we have enough space
 + * in the stat item
   */
  static int real_space_diff(struct inode *inode, int sd_size)
  {
        if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
                return sd_size;
  
 -      /* End of file is also in full block with indirect reference, so round
 -       ** up to the next block.
 -       **
 -       ** there is just no way to know if the tail is actually packed
 -       ** on the file, so we have to assume it isn't.  When we pack the
 -       ** tail, we add 4 bytes to pretend there really is an unformatted
 -       ** node pointer
 +      /*
 +       * End of file is also in full block with indirect reference, so round
 +       * up to the next block.
 +       *
 +       * there is just no way to know if the tail is actually packed
 +       * on the file, so we have to assume it isn't.  When we pack the
 +       * tail, we add 4 bytes to pretend there really is an unformatted
 +       * node pointer
         */
        bytes =
            ((inode->i_size +
@@@ -1216,36 -1108,36 +1216,36 @@@ static inline ulong to_fake_used_blocks
                bytes += (loff_t) 511;
        }
  
 -      /* files from before the quota patch might i_blocks such that
 -       ** bytes < real_space.  Deal with that here to prevent it from
 -       ** going negative.
 +      /*
 +       * files from before the quota patch might have i_blocks such that
 +       * bytes < real_space.  Deal with that here to prevent it from
 +       * going negative.
         */
        if (bytes < real_space)
                return 0;
        return (bytes - real_space) >> 9;
  }
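to_fake_used_blocks() above turns a byte count into 512-byte sectors while
clamping the pre-quota-patch case where bytes < real_space. A hedged
userspace sketch of just that guard (the values are illustrative):

#include <stdio.h>

typedef long long loff_t_sim;	/* stand-in for the kernel's loff_t */

/* bytes -> 512-byte sectors, never going negative */
static unsigned long fake_used_blocks(loff_t_sim bytes, loff_t_sim real_space)
{
	if (bytes < real_space)
		return 0;	/* old, pre-quota-patch inode */
	return (unsigned long)((bytes - real_space) >> 9);
}

int main(void)
{
	printf("%lu\n", fake_used_blocks(8192, 512));	/* 15 sectors */
	printf("%lu\n", fake_used_blocks(100, 512));	/* clamped to 0 */
	return 0;
}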
  
 -//
 -// BAD: new directories have stat data of new type and all other items
 -// of old type. Version stored in the inode says about body items, so
 -// in update_stat_data we can not rely on inode, but have to check
 -// item version directly
 -//
 +/*
 + * BAD: new directories have stat data of new type and all other items
 + * of old type. The version stored in the inode describes the body items,
 + * so in update_stat_data we cannot rely on the inode, but have to check
 + * the item version directly
 + */
  
 -// called by read_locked_inode
 +/* called by read_locked_inode */
  static void init_inode(struct inode *inode, struct treepath *path)
  {
        struct buffer_head *bh;
        struct item_head *ih;
        __u32 rdev;
 -      //int version = ITEM_VERSION_1;
  
        bh = PATH_PLAST_BUFFER(path);
 -      ih = PATH_PITEM_HEAD(path);
 +      ih = tp_item_head(path);
  
 -      copy_key(INODE_PKEY(inode), &(ih->ih_key));
 +      copy_key(INODE_PKEY(inode), &ih->ih_key);
  
 -      INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
 +      INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
        REISERFS_I(inode)->i_flags = 0;
        REISERFS_I(inode)->i_prealloc_block = 0;
        REISERFS_I(inode)->i_prealloc_count = 0;
  
        if (stat_data_v1(ih)) {
                struct stat_data_v1 *sd =
 -                  (struct stat_data_v1 *)B_I_PITEM(bh, ih);
 +                  (struct stat_data_v1 *)ih_item_body(bh, ih);
                unsigned long blocks;
  
                set_inode_item_key_version(inode, KEY_FORMAT_3_5);
                inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
                blocks = (inode->i_size + 511) >> 9;
                blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
 +
 +              /*
 +               * there was a bug in <=3.5.23 when i_blocks could take
 +               * negative values. Starting from 3.5.17 this value could
 +               * even be stored in stat data. For such files we set
 +               * i_blocks based on file size. Two notes: this can be
 +               * wrong for sparse files, and the on-disk value will only
 +               * be updated if the file's inode ever changes
 +               */
                if (inode->i_blocks > blocks) {
 -                      // there was a bug in <=3.5.23 when i_blocks could take negative
 -                      // values. Starting from 3.5.17 this value could even be stored in
 -                      // stat data. For such files we set i_blocks based on file
 -                      // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
 -                      // only updated if file's inode will ever change
                        inode->i_blocks = blocks;
                }
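The clamp above recomputes a plausible i_blocks from i_size: round the size
up to 512-byte sectors, then round the sector count up to a whole filesystem
block. A small sketch of that round-up, with the kernel's _ROUND_UP replaced
by an assumed power-of-two helper:

#include <stdio.h>

/* assumed equivalent of _ROUND_UP for a power-of-two 'unit' */
static unsigned long round_up_pow2(unsigned long x, unsigned long unit)
{
	return (x + unit - 1) & ~(unit - 1);
}

static unsigned long blocks_from_size(unsigned long long i_size,
				      unsigned long blocksize)
{
	unsigned long blocks = (unsigned long)((i_size + 511) >> 9);

	/* round sector count up to a whole fs block worth of sectors */
	return round_up_pow2(blocks, blocksize >> 9);
}

int main(void)
{
	/* 5000 bytes -> 10 sectors -> rounded to 16 (two 4k blocks) */
	printf("%lu\n", blocks_from_size(5000, 4096));
	return 0;
}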
  
                rdev = sd_v1_rdev(sd);
                REISERFS_I(inode)->i_first_direct_byte =
                    sd_v1_first_direct_byte(sd);
 -              /* an early bug in the quota code can give us an odd number for the
 -               ** block count.  This is incorrect, fix it here.
 +
 +              /*
 +               * an early bug in the quota code can give us an odd
 +               * number for the block count.  This is incorrect, fix it here.
                 */
                if (inode->i_blocks & 1) {
                        inode->i_blocks++;
                inode_set_bytes(inode,
                                to_real_used_space(inode, inode->i_blocks,
                                                   SD_V1_SIZE));
 -              /* nopack is initially zero for v1 objects. For v2 objects,
 -                 nopack is initialised from sd_attrs */
 +              /*
 +               * nopack is initially zero for v1 objects. For v2 objects,
 +               * nopack is initialised from sd_attrs
 +               */
                REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
        } else {
 -              // new stat data found, but object may have old items
 -              // (directories and symlinks)
 -              struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
 +              /*
 +               * new stat data found, but object may have old items
 +               * (directories and symlinks)
 +               */
 +              struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih);
  
                inode->i_mode = sd_v2_mode(sd);
                set_nlink(inode, sd_v2_nlink(sd));
                inode_set_bytes(inode,
                                to_real_used_space(inode, inode->i_blocks,
                                                   SD_V2_SIZE));
 -              /* read persistent inode attributes from sd and initialise
 -                 generic inode flags from them */
 +              /*
 +               * read persistent inode attributes from sd and initialise
 +               * generic inode flags from them
 +               */
                REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
                sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
        }
        }
  }
  
 -// update new stat data with inode fields
 +/* update new stat data with inode fields */
  static void inode2sd(void *sd, struct inode *inode, loff_t size)
  {
        struct stat_data *sd_v2 = (struct stat_data *)sd;
        set_sd_v2_attrs(sd_v2, flags);
  }
  
 -// used to copy inode's fields to old stat data
 +/* used to copy inode's fields to old stat data */
  static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
  {
        struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
        else
                set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
  
 -      // Sigh. i_first_direct_byte is back
 +      /* Sigh. i_first_direct_byte is back */
        set_sd_v1_first_direct_byte(sd_v1,
                                    REISERFS_I(inode)->i_first_direct_byte);
  }
  
 -/* NOTE, you must prepare the buffer head before sending it here,
 -** and then log it after the call
 -*/
 +/*
 + * NOTE, you must prepare the buffer head before sending it here,
 + * and then log it after the call
 + */
  static void update_stat_data(struct treepath *path, struct inode *inode,
                             loff_t size)
  {
        struct item_head *ih;
  
        bh = PATH_PLAST_BUFFER(path);
 -      ih = PATH_PITEM_HEAD(path);
 +      ih = tp_item_head(path);
  
        if (!is_statdata_le_ih(ih))
                reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
                               INODE_PKEY(inode), ih);
  
 +      /* path points to old stat data */
        if (stat_data_v1(ih)) {
 -              // path points to old stat data
 -              inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
 +              inode2sd_v1(ih_item_body(bh, ih), inode, size);
        } else {
 -              inode2sd(B_I_PITEM(bh, ih), inode, size);
 +              inode2sd(ih_item_body(bh, ih), inode, size);
        }
  
        return;
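update_stat_data() above illustrates the point of the earlier "BAD" comment:
the serializer is chosen by the item's own version, never by the inode. A
hedged sketch of that dispatch with invented, simplified v1/v2 layouts (none
of these fields are the real on-disk formats):

#include <stdint.h>
#include <stdio.h>

/* invented, simplified stand-ins for the two stat-data layouts */
struct sd_v1 { uint16_t mode; uint32_t size; };
struct sd_v2 { uint16_t mode; uint64_t size; };

struct inode_sim { uint16_t mode; uint64_t size; };
struct item_head_sim { int version; };	/* 1 == old (v1) stat data */

static void inode2sd_v1_sim(void *sd, const struct inode_sim *inode)
{
	struct sd_v1 *v1 = sd;

	v1->mode = inode->mode;
	v1->size = (uint32_t)inode->size;	/* v1 only holds 32 bits */
}

static void inode2sd_sim(void *sd, const struct inode_sim *inode)
{
	struct sd_v2 *v2 = sd;

	v2->mode = inode->mode;
	v2->size = inode->size;
}

/* dispatch on the item's own version, mirroring update_stat_data() */
static void update_sd_sim(const struct item_head_sim *ih, void *item,
			  const struct inode_sim *inode)
{
	if (ih->version == 1)
		inode2sd_v1_sim(item, inode);
	else
		inode2sd_sim(item, inode);
}

int main(void)
{
	struct inode_sim ino = { 0644, 123456 };
	struct item_head_sim ih = { 2 };
	unsigned char item[sizeof(struct sd_v2)];

	update_sd_sim(&ih, item, &ino);
	printf("wrote v%d stat data\n", ih.version);
	return 0;
}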
@@@ -1456,8 -1335,7 +1456,8 @@@ void reiserfs_update_sd_size(struct rei
  
        BUG_ON(!th->t_trans_id);
  
 -      make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);        //key type is unimportant
 +      /* key type is unimportant */
 +      make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);
  
        for (;;) {
                int pos;
                        return;
                }
  
 -              /* sigh, prepare_for_journal might schedule.  When it schedules the
 -               ** FS might change.  We have to detect that, and loop back to the
 -               ** search if the stat data item has moved
 +              /*
 +               * sigh, prepare_for_journal might schedule.  When it
 +               * schedules the FS might change.  We have to detect that,
 +               * and loop back to the search if the stat data item has moved
                 */
                bh = get_last_bh(&path);
 -              ih = get_ih(&path);
 +              ih = tp_item_head(&path);
                copy_item_head(&tmp_ih, ih);
                fs_gen = get_generation(inode->i_sb);
                reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
 +
 +              /* Stat_data item has been moved after scheduling. */
                if (fs_changed(fs_gen, inode->i_sb)
                    && item_moved(&tmp_ih, &path)) {
                        reiserfs_restore_prepared_buffer(inode->i_sb, bh);
 -                      continue;       /* Stat_data item has been moved after scheduling. */
 +                      continue;
                }
                break;
        }
        update_stat_data(&path, inode, size);
 -      journal_mark_dirty(th, th->t_super, bh);
 +      journal_mark_dirty(th, bh);
        pathrelse(&path);
        return;
  }
  
 -/* reiserfs_read_locked_inode is called to read the inode off disk, and it
 -** does a make_bad_inode when things go wrong.  But, we need to make sure
 -** and clear the key in the private portion of the inode, otherwise a
 -** corresponding iput might try to delete whatever object the inode last
 -** represented.
 -*/
 +/*
 + * reiserfs_read_locked_inode is called to read the inode off disk, and it
 + * does a make_bad_inode when things go wrong.  But, we need to make sure
 + * and clear the key in the private portion of the inode, otherwise a
 + * corresponding iput might try to delete whatever object the inode last
 + * represented.
 + */
  static void reiserfs_make_bad_inode(struct inode *inode)
  {
        memset(INODE_PKEY(inode), 0, KEY_SIZE);
        make_bad_inode(inode);
  }
  
 -//
 -// initially this function was derived from minix or ext2's analog and
 -// evolved as the prototype did
 -//
 -
 +/*
 + * initially this function was derived from minix or ext2's analog and
 + * evolved as the prototype did
 + */
  int reiserfs_init_locked_inode(struct inode *inode, void *p)
  {
        struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
        return 0;
  }
  
 -/* looks for stat data in the tree, and fills up the fields of in-core
 -   inode stat data fields */
 +/*
 + * looks for stat data in the tree, and fills in the in-core inode's
 + * fields from it
 + */
  void reiserfs_read_locked_inode(struct inode *inode,
                                struct reiserfs_iget_args *args)
  {
  
        dirino = args->dirid;
  
 -      /* set version 1, version 2 could be used too, because stat data
 -         key is the same in both versions */
 +      /*
 +       * set version 1, version 2 could be used too, because stat data
 +       * key is the same in both versions
 +       */
        key.version = KEY_FORMAT_3_5;
        key.on_disk_key.k_dir_id = dirino;
        key.on_disk_key.k_objectid = inode->i_ino;
                reiserfs_make_bad_inode(inode);
                return;
        }
 +
 +      /* a stale NFS handle can trigger this without it being an error */
        if (retval != ITEM_FOUND) {
 -              /* a stale NFS handle can trigger this without it being an error */
                pathrelse(&path_to_sd);
                reiserfs_make_bad_inode(inode);
                clear_nlink(inode);
  
        init_inode(inode, &path_to_sd);
  
 -      /* It is possible that knfsd is trying to access inode of a file
 -         that is being removed from the disk by some other thread. As we
 -         update sd on unlink all that is required is to check for nlink
 -         here. This bug was first found by Sizif when debugging
 -         SquidNG/Butterfly, forgotten, and found again after Philippe
 -         Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
 -
 -         More logical fix would require changes in fs/inode.c:iput() to
 -         remove inode from hash-table _after_ fs cleaned disk stuff up and
 -         in iget() to return NULL if I_FREEING inode is found in
 -         hash-table. */
 -      /* Currently there is one place where it's ok to meet inode with
 -         nlink==0: processing of open-unlinked and half-truncated files
 -         during mount (fs/reiserfs/super.c:finish_unfinished()). */
 +      /*
 +       * It is possible that knfsd is trying to access inode of a file
 +       * that is being removed from the disk by some other thread. As we
 +       * update sd on unlink all that is required is to check for nlink
 +       * here. This bug was first found by Sizif when debugging
 +       * SquidNG/Butterfly, forgotten, and found again after Philippe
 +       * Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
 +       *
 +       * A more logical fix would require changes in fs/inode.c:iput() to
 +       * remove inode from hash-table _after_ fs cleaned disk stuff up and
 +       * in iget() to return NULL if I_FREEING inode is found in
 +       * hash-table.
 +       */
 +
 +      /*
 +       * Currently there is one place where it's ok to meet inode with
 +       * nlink==0: processing of open-unlinked and half-truncated files
 +       * during mount (fs/reiserfs/super.c:finish_unfinished()).
 +       */
        if ((inode->i_nlink == 0) &&
            !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
                reiserfs_warning(inode->i_sb, "vs-13075",
                reiserfs_make_bad_inode(inode);
        }
  
 -      reiserfs_check_path(&path_to_sd);       /* init inode should be relsing */
 +      /* init_inode should be releasing the path */
 +      reiserfs_check_path(&path_to_sd);
  
        /*
         * Stat data v1 doesn't support ACLs.
                cache_no_acl(inode);
  }
  
 -/**
 +/*
   * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
   *
   * @inode:    inode from hash table to check
@@@ -1692,8 -1556,7 +1692,8 @@@ static struct dentry *reiserfs_get_dent
  struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type)
  {
 -      /* fhtype happens to reflect the number of u32s encoded.
 +      /*
 +       * fhtype happens to reflect the number of u32s encoded.
         * due to a bug in earlier code, fhtype might indicate there
         * are more u32s than actually fit.
         * so if fhtype seems to be more than len, reduce fhtype.
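The comment above describes defending against an old encoder bug where fhtype
claimed more u32s than the handle really holds; the defense is a simple
clamp. A sketch under that assumption (the names are invented):

#include <stdio.h>

/* if fh_type claims more u32s than fh_len provides, trust fh_len */
static int sanitize_fh_type(int fh_type, int fh_len)
{
	return fh_type > fh_len ? fh_len : fh_type;
}

int main(void)
{
	printf("%d\n", sanitize_fh_type(6, 5));	/* clamped to 5 */
	printf("%d\n", sanitize_fh_type(3, 5));	/* unchanged: 3 */
	return 0;
}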
@@@ -1762,16 -1625,13 +1762,16 @@@ int reiserfs_encode_fh(struct inode *in
        return *lenp;
  }
  
 -/* looks for stat data, then copies fields to it, marks the buffer
 -   containing stat data as dirty */
 -/* reiserfs inodes are never really dirty, since the dirty inode call
 -** always logs them.  This call allows the VFS inode marking routines
 -** to properly mark inodes for datasync and such, but only actually
 -** does something when called for a synchronous update.
 -*/
 +/*
 + * looks for stat data, then copies fields to it, marks the buffer
 + * containing stat data as dirty
 + */
 +/*
 + * reiserfs inodes are never really dirty, since the dirty inode call
 + * always logs them.  This call allows the VFS inode marking routines
 + * to properly mark inodes for datasync and such, but only actually
 + * does something when called for a synchronous update.
 + */
  int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
  {
        struct reiserfs_transaction_handle th;
  
        if (inode->i_sb->s_flags & MS_RDONLY)
                return -EROFS;
 -      /* memory pressure can sometimes initiate write_inode calls with sync == 1,
 -       ** these cases are just when the system needs ram, not when the
 -       ** inode needs to reach disk for safety, and they can safely be
 -       ** ignored because the altered inode has already been logged.
 +      /*
 +       * memory pressure can sometimes initiate write_inode calls with
 +       * sync == 1.  These cases are just when the system needs ram, not
 +       * when the inode needs to reach disk for safety, and they can
 +       * safely be ignored because the altered inode has already been
 +       * logged.
         */
        if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
                reiserfs_write_lock(inode->i_sb);
                if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
                        reiserfs_update_sd(&th, inode);
 -                      journal_end_sync(&th, inode->i_sb, jbegin_count);
 +                      journal_end_sync(&th);
                }
                reiserfs_write_unlock(inode->i_sb);
        }
        return 0;
  }
  
 -/* stat data of new object is inserted already, this inserts the item
 -   containing "." and ".." entries */
 +/*
 + * stat data of new object is inserted already, this inserts the item
 + * containing "." and ".." entries
 + */
  static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
                                  struct inode *inode,
                                  struct item_head *ih, struct treepath *path,
                      le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
                      TYPE_DIRENTRY, 3 /*key length */ );
  
 -      /* compose item head for new item. Directories consist of items of
 -         old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
 -         is done by reiserfs_new_inode */
 +      /*
 +       * compose item head for new item. Directories consist of items of
 +       * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
 +       * is done by reiserfs_new_inode
 +       */
        if (old_format_only(sb)) {
                make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
                                  TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
        return reiserfs_insert_item(th, path, &key, ih, inode, body);
  }
  
 -/* stat data of object has been inserted, this inserts the item
 -   containing the body of symlink */
 -static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode,  /* Inode of symlink */
 +/*
 + * stat data of object has been inserted, this inserts the item
 + * containing the body of symlink
 + */
 +static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th,
 +                              struct inode *inode,
                                struct item_head *ih,
                                struct treepath *path, const char *symname,
                                int item_len)
        return reiserfs_insert_item(th, path, &key, ih, inode, symname);
  }
  
 -/* inserts the stat data into the tree, and then calls
 -   reiserfs_new_directory (to insert ".", ".." item if new object is
 -   directory) or reiserfs_new_symlink (to insert symlink body if new
 -   object is symlink) or nothing (if new object is regular file)
 -
 -   NOTE! uid and gid must already be set in the inode.  If we return
 -   non-zero due to an error, we have to drop the quota previously allocated
 -   for the fresh inode.  This can only be done outside a transaction, so
 -   if we return non-zero, we also end the transaction.  */
 +/*
 + * inserts the stat data into the tree, and then calls
 + * reiserfs_new_directory (to insert ".", ".." item if new object is
 + * directory) or reiserfs_new_symlink (to insert symlink body if new
 + * object is symlink) or nothing (if new object is regular file)
 + *
 + * NOTE! uid and gid must already be set in the inode.  If we return
 + * non-zero due to an error, we have to drop the quota previously allocated
 + * for the fresh inode.  This can only be done outside a transaction, so
 + * if we return non-zero, we also end the transaction.
 + *
 + * @th: active transaction handle
 + * @dir: parent directory for new inode
 + * @mode: mode of new inode
 + * @symname: symlink contents if inode is symlink
 + * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for
 + *         symlinks
 + * @inode: inode to be filled
 + * @security: optional security context to associate with this inode
 + */
  int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
                       struct inode *dir, umode_t mode, const char *symname,
                      /* 0 for regular, EMPTY_DIR_SIZE for dirs,
        else
                make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
                                  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
 -      memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
 +      memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE);
        args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
  
        depth = reiserfs_write_unlock_nested(inode->i_sb);
        }
  
        if (old_format_only(sb))
 -              /* not a perfect generation count, as object ids can be reused, but
 -               ** this is as good as reiserfs can do right now.
 -               ** note that the private part of inode isn't filled in yet, we have
 -               ** to use the directory.
 +              /*
 +               * not a perfect generation count, as object ids can be reused,
 +               * but this is as good as reiserfs can do right now.
 +               * note that the private part of inode isn't filled in yet,
 +               * we have to use the directory.
                 */
                inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
        else
        REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
            U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
  
 -      INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
 +      INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
        REISERFS_I(inode)->i_flags = 0;
        REISERFS_I(inode)->i_prealloc_block = 0;
        REISERFS_I(inode)->i_prealloc_count = 0;
                goto out_bad_inode;
        }
        if (old_format_only(sb)) {
 +              /* i_uid or i_gid is too big to be stored in stat data v3.5 */
                if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) {
                        pathrelse(&path_to_key);
 -                      /* i_uid or i_gid is too big to be stored in stat data v3.5 */
                        err = -EINVAL;
                        goto out_bad_inode;
                }
        } else {
                inode2sd(&sd, inode, inode->i_size);
        }
 -      // store in in-core inode the key of stat data and version all
 -      // object items will have (directory items will have old offset
 -      // format, other new objects will consist of new items)
 +      /*
 +       * store in in-core inode the key of stat data and version all
 +       * object items will have (directory items will have old offset
 +       * format, other new objects will consist of new items)
 +       */
        if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
                set_inode_item_key_version(inode, KEY_FORMAT_3_5);
        else
        if (retval) {
                err = retval;
                reiserfs_check_path(&path_to_key);
 -              journal_end(th, th->t_super, th->t_blocks_allocated);
 +              journal_end(th);
                goto out_inserted_sd;
        }
  
                if (retval) {
                        err = retval;
                        reiserfs_check_path(&path_to_key);
 -                      journal_end(th, th->t_super, th->t_blocks_allocated);
 +                      journal_end(th);
                        goto out_inserted_sd;
                }
        } else if (inode->i_sb->s_flags & MS_POSIXACL) {
                if (retval) {
                        err = retval;
                        reiserfs_check_path(&path_to_key);
 -                      retval = journal_end(th, th->t_super,
 -                                           th->t_blocks_allocated);
 +                      retval = journal_end(th);
                        if (retval)
                                err = retval;
                        goto out_inserted_sd;
  
        return 0;
  
 -/* it looks like you can easily compress these two goto targets into
 - * one.  Keeping it like this doesn't actually hurt anything, and they
 - * are place holders for what the quota code actually needs.
 - */
 -      out_bad_inode:
 +out_bad_inode:
        /* Invalidate the object, nothing was inserted yet */
        INODE_PKEY(inode)->k_objectid = 0;
  
        dquot_free_inode(inode);
        reiserfs_write_lock_nested(inode->i_sb, depth);
  
 -      out_end_trans:
 -      journal_end(th, th->t_super, th->t_blocks_allocated);
 -      /* Drop can be outside and it needs more credits so it's better to have it outside */
 +out_end_trans:
 +      journal_end(th);
 +      /*
 +       * dquot_drop() can be called outside the transaction, and it needs
 +       * more credits, so it's better to have it outside
 +       */
        depth = reiserfs_write_unlock_nested(inode->i_sb);
        dquot_drop(inode);
        reiserfs_write_lock_nested(inode->i_sb, depth);
        inode->i_flags |= S_NOQUOTA;
        make_bad_inode(inode);
  
 -      out_inserted_sd:
 +out_inserted_sd:
        clear_nlink(inode);
        th->t_trans_id = 0;     /* so the caller can't use this handle later */
        unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
  }
  
  /*
 -** finds the tail page in the page cache,
 -** reads the last block in.
 -**
 -** On success, page_result is set to a locked, pinned page, and bh_result
 -** is set to an up to date buffer for the last block in the file.  returns 0.
 -**
 -** tail conversion is not done, so bh_result might not be valid for writing
 -** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
 -** trying to write the block.
 -**
 -** on failure, nonzero is returned, page_result and bh_result are untouched.
 -*/
 + * finds the tail page in the page cache,
 + * reads the last block in.
 + *
 + * On success, page_result is set to a locked, pinned page, and bh_result
 + * is set to an up to date buffer for the last block in the file.  returns 0.
 + *
 + * tail conversion is not done, so bh_result might not be valid for
 + * writing; check buffer_mapped(bh_result) and bh_result->b_blocknr != 0
 + * before trying to write the block.
 + *
 + * on failure, nonzero is returned, page_result and bh_result are untouched.
 + */
  static int grab_tail_page(struct inode *inode,
                          struct page **page_result,
                          struct buffer_head **bh_result)
  {
  
 -      /* we want the page with the last byte in the file,
 -       ** not the page that will hold the next byte for appending
 +      /*
 +       * we want the page with the last byte in the file,
 +       * not the page that will hold the next byte for appending
         */
        unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
        unsigned long pos = 0;
        struct page *page;
        int error;
  
 -      /* we know that we are only called with inode->i_size > 0.
 -       ** we also know that a file tail can never be as big as a block
 -       ** If i_size % blocksize == 0, our file is currently block aligned
 -       ** and it won't need converting or zeroing after a truncate.
 +      /*
 +       * we know that we are only called with inode->i_size > 0.
 +       * we also know that a file tail can never be as big as a block
 +       * If i_size % blocksize == 0, our file is currently block aligned
 +       * and it won't need converting or zeroing after a truncate.
         */
        if ((offset & (blocksize - 1)) == 0) {
                return -ENOENT;
        } while (bh != head);
  
        if (!buffer_uptodate(bh)) {
 -              /* note, this should never happen, prepare_write should
 -               ** be taking care of this for us.  If the buffer isn't up to date,
 -               ** I've screwed up the code to find the buffer, or the code to
 -               ** call prepare_write
 +              /*
 +               * note, this should never happen, prepare_write should be
 +               * taking care of this for us.  If the buffer isn't up to
 +               * date, I've screwed up the code to find the buffer, or the
 +               * code to call prepare_write
                 */
                reiserfs_error(inode->i_sb, "clm-6000",
                               "error reading block %lu", bh->b_blocknr);
        *bh_result = bh;
        *page_result = page;
  
 -      out:
 +out:
        return error;
  
 -      unlock:
 +unlock:
        unlock_page(page);
        page_cache_release(page);
        return error;
  }
  
  /*
 -** vfs version of truncate file.  Must NOT be called with
 -** a transaction already started.
 -**
 -** some code taken from block_truncate_page
 -*/
 + * vfs version of truncate file.  Must NOT be called with
 + * a transaction already started.
 + *
 + * some code taken from block_truncate_page
 + */
  int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
  {
        struct reiserfs_transaction_handle th;
        if (inode->i_size > 0) {
                error = grab_tail_page(inode, &page, &bh);
                if (error) {
 -                      // -ENOENT means we truncated past the end of the file,
 -                      // and get_block_create_0 could not find a block to read in,
 -                      // which is ok.
 +                      /*
 +                       * -ENOENT means we truncated past the end of the
 +                       * file, and get_block_create_0 could not find a
 +                       * block to read in, which is ok.
 +                       */
                        if (error != -ENOENT)
                                reiserfs_error(inode->i_sb, "clm-6001",
                                               "grab_tail_page failed %d",
                }
        }
  
 -      /* so, if page != NULL, we have a buffer head for the offset at
 -       ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
 -       ** then we have an unformatted node.  Otherwise, we have a direct item,
 -       ** and no zeroing is required on disk.  We zero after the truncate,
 -       ** because the truncate might pack the item anyway
 -       ** (it will unmap bh if it packs).
 +      /*
 +       * so, if page != NULL, we have a buffer head for the offset at
 +       * the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
 +       * then we have an unformatted node.  Otherwise, we have a direct item,
 +       * and no zeroing is required on disk.  We zero after the truncate,
 +       * because the truncate might pack the item anyway
 +       * (it will unmap bh if it packs).
 +       *
 +       * it is enough to reserve space in transaction for 2 balancings:
 +       * one for "save" link adding and another for the first
 +       * cut_from_item. 1 is for update_sd
         */
 -      /* it is enough to reserve space in transaction for 2 balancings:
 -         one for "save" link adding and another for the first
 -         cut_from_item. 1 is for update_sd */
        error = journal_begin(&th, inode->i_sb,
                              JOURNAL_PER_BALANCE_CNT * 2 + 1);
        if (error)
                goto out;
        reiserfs_update_inode_transaction(inode);
        if (update_timestamps)
 -              /* we are doing real truncate: if the system crashes before the last
 -                 transaction of truncating gets committed - on reboot the file
 -                 either appears truncated properly or not truncated at all */
 +              /*
 +               * we are doing real truncate: if the system crashes
 +               * before the last transaction of truncating gets committed
 +               * - on reboot the file either appears truncated properly
 +               * or not truncated at all
 +               */
                add_save_link(&th, inode, 1);
        err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
 -      error =
 -          journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
 +      error = journal_end(&th);
        if (error)
                goto out;
  
        reiserfs_write_unlock(inode->i_sb);
  
        return 0;
 -      out:
 +out:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
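grab_tail_page() above wants the page holding the file's last byte, not the
page that would receive the next appended byte, and it bails out early when
i_size is already block aligned. A userspace sketch of those two computations
(4k pages and the sample sizes are assumptions):

#include <stdio.h>

#define PAGE_SHIFT_SIM 12	/* assumed 4k pages */

static unsigned long last_page_index(unsigned long long i_size)
{
	/* page of the last byte, not of the next byte to append */
	return (unsigned long)((i_size - 1) >> PAGE_SHIFT_SIM);
}

static int tail_needs_work(unsigned long long i_size, unsigned blocksize)
{
	/* block-aligned sizes need no converting or zeroing */
	return (i_size & (blocksize - 1)) != 0;
}

int main(void)
{
	printf("index=%lu\n", last_page_index(4096));	  /* 0, not 1 */
	printf("index=%lu\n", last_page_index(4097));	  /* 1 */
	printf("work=%d\n", tail_needs_work(8192, 4096)); /* 0: aligned */
	printf("work=%d\n", tail_needs_work(8200, 4096)); /* 1 */
	return 0;
}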
@@@ -2382,10 -2212,7 +2382,10 @@@ static int map_block_for_writepage(stru
        int copy_size;
        int trans_running = 0;
  
 -      /* catch places below that try to log something without starting a trans */
 +      /*
 +       * catch places below that try to log something without
 +       * starting a trans
 +       */
        th.t_trans_id = 0;
  
        if (!buffer_uptodate(bh_result)) {
        }
  
        kmap(bh_result->b_page);
 -      start_over:
 +start_over:
        reiserfs_write_lock(inode->i_sb);
        make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
  
 -      research:
 +research:
        retval = search_for_position_by_key(inode->i_sb, &key, &path);
        if (retval != POSITION_FOUND) {
                use_get_block = 1;
        }
  
        bh = get_last_bh(&path);
 -      ih = get_ih(&path);
 -      item = get_item(&path);
 +      ih = tp_item_head(&path);
 +      item = tp_item_body(&path);
        pos_in_item = path.pos_in_item;
  
        /* we've found an unformatted node */
                        goto research;
                }
  
 -              memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
 +              memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied,
                       copy_size);
  
 -              journal_mark_dirty(&th, inode->i_sb, bh);
 +              journal_mark_dirty(&th, bh);
                bytes_copied += copy_size;
                set_block_dev_mapped(bh_result, 0, inode);
  
        }
        retval = 0;
  
 -      out:
 +out:
        pathrelse(&path);
        if (trans_running) {
 -              int err = journal_end(&th, inode->i_sb, jbegin_count);
 +              int err = journal_end(&th);
                if (err)
                        retval = err;
                trans_running = 0;
        kunmap(bh_result->b_page);
  
        if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
 -              /* we've copied data from the page into the direct item, so the
 +              /*
 +               * we've copied data from the page into the direct item, so the
                 * buffer in the page is now clean, mark it to reflect that.
                 */
                lock_buffer(bh_result);
@@@ -2544,8 -2370,7 +2544,8 @@@ static int reiserfs_write_full_page(str
                return 0;
        }
  
 -      /* The page dirty bit is cleared before writepage is called, which
 +      /*
 +       * The page dirty bit is cleared before writepage is called, which
         * means we have to tell create_empty_buffers to make dirty buffers
         * The page really should be up to date at this point, so tossing
         * in the BH_Uptodate is just a sanity check.
        }
        head = page_buffers(page);
  
 -      /* last page in the file, zero out any contents past the
 -       ** last byte in the file
 +      /*
 +       * last page in the file, zero out any contents past the
 +       * last byte in the file
         */
        if (page->index >= end_index) {
                unsigned last_offset;
                           (!buffer_mapped(bh) || (buffer_mapped(bh)
                                                       && bh->b_blocknr ==
                                                       0))) {
 -                      /* not mapped yet, or it points to a direct item, search
 +                      /*
 +                       * not mapped yet, or it points to a direct item, search
                         * the btree for the mapping info, and log any direct
                         * items found
                         */
  
                if (checked) {
                        reiserfs_prepare_for_journal(s, bh, 1);
 -                      journal_mark_dirty(&th, s, bh);
 +                      journal_mark_dirty(&th, bh);
                        continue;
                }
 -              /* from this point on, we know the buffer is mapped to a
 +              /*
 +               * from this point on, we know the buffer is mapped to a
                 * real block and not a direct item
                 */
                if (wbc->sync_mode != WB_SYNC_NONE) {
        } while ((bh = bh->b_this_page) != head);
  
        if (checked) {
 -              error = journal_end(&th, s, bh_per_page + 1);
 +              error = journal_end(&th);
                reiserfs_write_unlock(s);
                if (error)
                        goto fail;
        } while (bh != head);
  
        error = 0;
 -      done:
 +done:
        if (nr == 0) {
                /*
                 * if this page only had a direct item, it is very possible for
        }
        return error;
  
 -      fail:
 -      /* catches various errors, we need to make sure any valid dirty blocks
 +fail:
 +      /*
 +       * catches various errors, we need to make sure any valid dirty blocks
         * get to the media.  The page is currently locked and not marked for
         * writeback
         */
                        mark_buffer_async_write(bh);
                } else {
                        /*
 -                       * clear any dirty bits that might have come from getting
 -                       * attached to a dirty page
 +                       * clear any dirty bits that might have come from
 +                       * getting attached to a dirty page
                         */
                        clear_buffer_dirty(bh);
                }
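The end_index check in the hunk above zeroes whatever sits past the file's
last byte in its final page before write-out. A minimal sketch of that
boundary math (page size assumed, helper name invented):

#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT_SIM 12			/* assumed 4k pages */
#define PAGE_SIZE_SIM  (1UL << PAGE_SHIFT_SIM)

/* zero the part of the file's last page that lies past i_size */
static void zero_past_eof(unsigned char *page, unsigned long page_index,
			  unsigned long long i_size)
{
	unsigned long end_index = (unsigned long)(i_size >> PAGE_SHIFT_SIM);
	unsigned long last_offset =
	    (unsigned long)(i_size & (PAGE_SIZE_SIM - 1));

	if (page_index >= end_index && last_offset)
		memset(page + last_offset, 0, PAGE_SIZE_SIM - last_offset);
}

int main(void)
{
	unsigned char page[PAGE_SIZE_SIM];

	memset(page, 0xff, sizeof(page));
	/* page 2 of a file with 100 live bytes on its last page */
	zero_past_eof(page, 2, 2 * PAGE_SIZE_SIM + 100);
	printf("byte 99=%02x byte 100=%02x\n", page[99], page[100]);
	return 0;
}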
@@@ -2793,18 -2614,15 +2793,18 @@@ static int reiserfs_write_begin(struct 
        ret = __block_write_begin(page, pos, len, reiserfs_get_block);
        if (ret && reiserfs_transaction_running(inode->i_sb)) {
                struct reiserfs_transaction_handle *th = current->journal_info;
 -              /* this gets a little ugly.  If reiserfs_get_block returned an
 -               * error and left a transacstion running, we've got to close it,
 -               * and we've got to free handle if it was a persistent transaction.
 +              /*
 +               * this gets a little ugly.  If reiserfs_get_block returned an
 +               * error and left a transaction running, we've got to close
 +               * it, and we've got to free handle if it was a persistent
 +               * transaction.
                 *
                 * But, if we had nested into an existing transaction, we need
                 * to just drop the ref count on the handle.
                 *
                 * If old_ref == 0, the transaction is from reiserfs_get_block,
 -               * and it was a persistent trans.  Otherwise, it was nested above.
 +               * and it was a persistent trans.  Otherwise, it was nested
 +               * above.
                 */
                if (th->t_refcount > old_ref) {
                        if (old_ref)
@@@ -2853,18 -2671,15 +2853,18 @@@ int __reiserfs_write_begin(struct page 
        ret = __block_write_begin(page, from, len, reiserfs_get_block);
        if (ret && reiserfs_transaction_running(inode->i_sb)) {
                struct reiserfs_transaction_handle *th = current->journal_info;
 -              /* this gets a little ugly.  If reiserfs_get_block returned an
 -               * error and left a transacstion running, we've got to close it,
 -               * and we've got to free handle if it was a persistent transaction.
 +              /*
 +               * this gets a little ugly.  If reiserfs_get_block returned an
 +               * error and left a transaction running, we've got to close
 +               * it, and we've got to free handle if it was a persistent
 +               * transaction.
                 *
                 * But, if we had nested into an existing transaction, we need
                 * to just drop the ref count on the handle.
                 *
                 * If old_ref == 0, the transaction is from reiserfs_get_block,
 -               * and it was a persistent trans.  Otherwise, it was nested above.
 +               * and it was a persistent trans.  Otherwise, it was nested
 +               * above.
                 */
                if (th->t_refcount > old_ref) {
                        if (old_ref)
@@@ -2919,20 -2734,17 +2919,20 @@@ static int reiserfs_write_end(struct fi
  
        reiserfs_commit_page(inode, page, start, start + copied);
  
 -      /* generic_commit_write does this for us, but does not update the
 -       ** transaction tracking stuff when the size changes.  So, we have
 -       ** to do the i_size updates here.
 +      /*
 +       * generic_commit_write does this for us, but does not update the
 +       * transaction tracking stuff when the size changes.  So, we have
 +       * to do the i_size updates here.
         */
        if (pos + copied > inode->i_size) {
                struct reiserfs_transaction_handle myth;
                reiserfs_write_lock(inode->i_sb);
                locked = true;
 -              /* If the file have grown beyond the border where it
 -                 can have a tail, unmark it as needing a tail
 -                 packing */
 +              /*
 +               * If the file has grown beyond the border where it
 +               * can have a tail, unmark it as needing tail
 +               * packing
 +               */
                if ((have_large_tails(inode->i_sb)
                     && inode->i_size > i_block_size(inode) * 4)
                    || (have_small_tails(inode->i_sb)
                inode->i_size = pos + copied;
                /*
                 * this will just nest into our transaction.  It's important
 -               * to use mark_inode_dirty so the inode gets pushed around on the
 -               * dirty lists, and so that O_SYNC works as expected
 +               * to use mark_inode_dirty so the inode gets pushed around on
 +               * the dirty lists, and so that O_SYNC works as expected
                 */
                mark_inode_dirty(inode);
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
 -              ret = journal_end(&myth, inode->i_sb, 1);
 +              ret = journal_end(&myth);
                if (ret)
                        goto journal_error;
        }
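The size check above drops the tail-packing hint once a file with large tails
enabled grows past four filesystem blocks; the small-tails branch is cut off
in this hunk, so this sketch models only the visible case (flag name
invented):

#include <stdio.h>

#define I_PACK_ON_CLOSE 0x1	/* assumed flag, for illustration only */

struct inode_sim {
	unsigned long long i_size;
	unsigned flags;
};

/* clear the tail-packing hint once the file is past 4 blocks */
static void maybe_unmark_tail_packing(struct inode_sim *ino,
				      unsigned block_size,
				      int have_large_tails)
{
	if (have_large_tails && ino->i_size > 4ULL * block_size)
		ino->flags &= ~I_PACK_ON_CLOSE;
}

int main(void)
{
	struct inode_sim ino = { 20000, I_PACK_ON_CLOSE };

	maybe_unmark_tail_packing(&ino, 4096, 1);
	printf("flags=%#x\n", ino.flags);	/* 0: 20000 > 16384 */
	return 0;
}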
                        goto out;
        }
  
 -      out:
 +out:
        if (locked)
                reiserfs_write_unlock(inode->i_sb);
        unlock_page(page);
  
        return ret == 0 ? copied : ret;
  
 -      journal_error:
 +journal_error:
        reiserfs_write_unlock(inode->i_sb);
        locked = false;
        if (th) {
@@@ -3010,18 -2822,15 +3010,18 @@@ int reiserfs_commit_write(struct file *
        }
        reiserfs_commit_page(inode, page, from, to);
  
 -      /* generic_commit_write does this for us, but does not update the
 -       ** transaction tracking stuff when the size changes.  So, we have
 -       ** to do the i_size updates here.
 +      /*
 +       * generic_commit_write does this for us, but does not update the
 +       * transaction tracking stuff when the size changes.  So, we have
 +       * to do the i_size updates here.
         */
        if (pos > inode->i_size) {
                struct reiserfs_transaction_handle myth;
 -              /* If the file have grown beyond the border where it
 -                 can have a tail, unmark it as needing a tail
 -                 packing */
 +              /*
 +               * If the file has grown beyond the border where it
 +               * can have a tail, unmark it as needing tail
 +               * packing
 +               */
                if ((have_large_tails(inode->i_sb)
                     && inode->i_size > i_block_size(inode) * 4)
                    || (have_small_tails(inode->i_sb)
                inode->i_size = pos;
                /*
                 * this will just nest into our transaction.  It's important
 -               * to use mark_inode_dirty so the inode gets pushed around on the
 -               * dirty lists, and so that O_SYNC works as expected
 +               * to use mark_inode_dirty so the inode gets pushed around
 +               * on the dirty lists, and so that O_SYNC works as expected
                 */
                mark_inode_dirty(inode);
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
 -              ret = journal_end(&myth, inode->i_sb, 1);
 +              ret = journal_end(&myth);
                if (ret)
                        goto journal_error;
        }
                        goto out;
        }
  
 -      out:
 +out:
        return ret;
  
 -      journal_error:
 +journal_error:
        if (th) {
                if (!update_sd)
                        reiserfs_update_sd(th, inode);
@@@ -3115,10 -2924,9 +3115,10 @@@ void i_attrs_to_sd_attrs(struct inode *
        }
  }
  
 -/* decide if this buffer needs to stay around for data logging or ordered
 -** write purposes
 -*/
 +/*
 + * decide if this buffer needs to stay around for data logging or ordered
 + * write purposes
 + */
  static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
  {
        int ret = 1;
        if (!buffer_mapped(bh)) {
                goto free_jh;
        }
 -      /* the page is locked, and the only places that log a data buffer
 +      /*
 +       * the page is locked, and the only places that log a data buffer
         * also lock the page.
         */
        if (reiserfs_file_data_log(inode)) {
                struct reiserfs_journal_list *jl;
                struct reiserfs_jh *jh = bh->b_private;
  
 -              /* why is this safe?
 +              /*
 +               * why is this safe?
                 * reiserfs_setattr updates i_size in the on disk
                 * stat data before allowing vmtruncate to be called.
                 *
                    && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
                        ret = 0;
        }
 -      free_jh:
 +free_jh:
        if (ret && bh->b_private) {
                reiserfs_free_jh(bh);
        }
@@@ -3222,7 -3028,7 +3222,7 @@@ static void reiserfs_invalidatepage(str
                ret = try_to_release_page(page, 0);
                /* maybe should BUG_ON(!ret); - neilb */
        }
 -      out:
 +out:
        return;
  }
  
@@@ -3274,20 -3080,18 +3274,20 @@@ static int reiserfs_releasepage(struct 
        return ret;
  }
  
 -/* We thank Mingming Cao for helping us understand in great detail what
 -   to do in this section of the code. */
 +/*
 + * We thank Mingming Cao for helping us understand in great detail what
 + * to do in this section of the code.
 + */
  static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
-                                 const struct iovec *iov, loff_t offset,
-                                 unsigned long nr_segs)
+                                 struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
+       size_t count = iov_iter_count(iter);
        ssize_t ret;
  
-       ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
-                                 reiserfs_get_blocks_direct_io);
+       ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+                                reiserfs_get_blocks_direct_io);
  
        /*
         * In case of error extending write may have instantiated a few
         */
        if (unlikely((rw & WRITE) && ret < 0)) {
                loff_t isize = i_size_read(inode);
-               loff_t end = offset + iov_length(iov, nr_segs);
+               loff_t end = offset + count;
  
                if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
                        truncate_setsize(inode, isize);
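
The conversion above is the recurring pattern of this series: the request size is sampled once with iov_iter_count() before ->direct_IO runs, replacing the old iov_length() sum over raw iovecs. A minimal sketch of that shape, under stated assumptions ("my_get_blocks" is a hypothetical get_block_t callback; error handling trimmed):

static int my_get_blocks(struct inode *, sector_t, struct buffer_head *, int);	/* hypothetical */

static ssize_t sketch_direct_IO(int rw, struct kiocb *iocb,
				struct iov_iter *iter, loff_t offset)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);	/* grab the size up front */
	ssize_t ret;

	ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, my_get_blocks);
	if (unlikely((rw & WRITE) && ret < 0)) {
		/* a failed extending write may have instantiated blocks
		 * beyond the old EOF; trim them back */
		loff_t isize = i_size_read(inode);
		loff_t end = offset + count;	/* was iov_length(iov, nr_segs) */

		if (end > isize && inode_newsize_ok(inode, isize) == 0)
			truncate_setsize(inode, isize);
	}
	return ret;
}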
@@@ -3323,9 -3127,8 +3323,9 @@@ int reiserfs_setattr(struct dentry *den
                dquot_initialize(inode);
        reiserfs_write_lock(inode->i_sb);
        if (attr->ia_valid & ATTR_SIZE) {
 -              /* version 2 items will be caught by the s_maxbytes check
 -               ** done for us in vmtruncate
 +              /*
 +               * version 2 items will be caught by the s_maxbytes check
 +               * done for us in vmtruncate
                 */
                if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
                    attr->ia_size > MAX_NON_LFS) {
                                err = journal_begin(&th, inode->i_sb, 4);
                                if (!err) {
                                        reiserfs_discard_prealloc(&th, inode);
 -                                      err = journal_end(&th, inode->i_sb, 4);
 +                                      err = journal_end(&th);
                                }
                                if (err)
                                        error = err;
                if (error)
                        return error;
  
 -              /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
 +              /*
 +               * (user+group)*(old+new) structure - we count quota
 +               * info and inode write (sb, inode)
 +               */
                reiserfs_write_lock(inode->i_sb);
                error = journal_begin(&th, inode->i_sb, jbegin_count);
                reiserfs_write_unlock(inode->i_sb);
                error = dquot_transfer(inode, attr);
                reiserfs_write_lock(inode->i_sb);
                if (error) {
 -                      journal_end(&th, inode->i_sb, jbegin_count);
 +                      journal_end(&th);
                        reiserfs_write_unlock(inode->i_sb);
                        goto out;
                }
  
 -              /* Update corresponding info in inode so that everything is in
 -               * one transaction */
 +              /*
 +               * Update corresponding info in inode so that everything
 +               * is in one transaction
 +               */
                if (attr->ia_valid & ATTR_UID)
                        inode->i_uid = attr->ia_uid;
                if (attr->ia_valid & ATTR_GID)
                        inode->i_gid = attr->ia_gid;
                mark_inode_dirty(inode);
 -              error = journal_end(&th, inode->i_sb, jbegin_count);
 +              error = journal_end(&th);
                reiserfs_write_unlock(inode->i_sb);
                if (error)
                        goto out;
            attr->ia_size != i_size_read(inode)) {
                error = inode_newsize_ok(inode, attr->ia_size);
                if (!error) {
 +                      /*
 +                       * Could race against reiserfs_file_release
 +                       * if called from NFS, so take tailpack mutex.
 +                       */
 +                      mutex_lock(&REISERFS_I(inode)->tailpack);
                        truncate_setsize(inode, attr->ia_size);
 -                      reiserfs_vfs_truncate_file(inode);
 +                      reiserfs_truncate_file(inode, 1);
 +                      mutex_unlock(&REISERFS_I(inode)->tailpack);
                }
        }
  
diff --combined fs/ubifs/file.c
index 0ab7f7dfb98b632818a9b1dde1e74f4799633b8b,0888502a60415223ba9285447c9e2f4425bc4076..b5b593c4527005ba50fe0745f2651095dba79331
@@@ -903,9 -903,8 +903,9 @@@ static int do_writepage(struct page *pa
        struct ubifs_info *c = inode->i_sb->s_fs_info;
  
  #ifdef UBIFS_DEBUG
 +      struct ubifs_inode *ui = ubifs_inode(inode);
        spin_lock(&ui->ui_lock);
 -      ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE);
 +      ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT);
        spin_unlock(&ui->ui_lock);
  #endif
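
The assertion fix above is a units bug: page->index counts pages while synced_i_size counts bytes, so the byte value must be shifted down by PAGE_CACHE_SHIFT, not shifted up by PAGE_CACHE_SIZE (which is 4096, an absurd shift count). A self-contained plain-C check of the unit conversion, with made-up values:

#include <assert.h>

#define PAGE_SHIFT 12	/* stand-in for PAGE_CACHE_SHIFT */

int main(void)
{
	unsigned long long synced_i_size = 5ULL << PAGE_SHIFT;	/* 5 pages, in bytes */
	unsigned long page_index = 4;				/* last synced page */

	/* pages compare against pages: convert bytes with >> PAGE_SHIFT */
	assert(page_index <= (synced_i_size >> PAGE_SHIFT));
	return 0;
}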
  
@@@ -1364,17 -1363,17 +1364,17 @@@ static inline int mctime_update_needed(
  
  /**
  * update_mctime - update mtime and ctime of an inode.
-  * @c: UBIFS file-system description object
   * @inode: inode to update
   *
  * This function updates the mtime and ctime of the inode if they differ from
  * the current time. Returns zero on success and a negative error code on
  * failure.
   */
- static int update_mctime(struct ubifs_info *c, struct inode *inode)
+ static int update_mctime(struct inode *inode)
  {
        struct timespec now = ubifs_current_time(inode);
        struct ubifs_inode *ui = ubifs_inode(inode);
+       struct ubifs_info *c = inode->i_sb->s_fs_info;
  
        if (mctime_update_needed(inode, &now)) {
                int err, release;
        return 0;
  }
  
- static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                              unsigned long nr_segs, loff_t pos)
+ static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
-       int err;
-       struct inode *inode = iocb->ki_filp->f_mapping->host;
-       struct ubifs_info *c = inode->i_sb->s_fs_info;
-       err = update_mctime(c, inode);
+       int err = update_mctime(file_inode(iocb->ki_filp));
        if (err)
                return err;
  
-       return generic_file_aio_write(iocb, iov, nr_segs, pos);
+       return generic_file_write_iter(iocb, from);
  }
  
  static int ubifs_set_page_dirty(struct page *page)
@@@ -1526,7 -1520,8 +1521,7 @@@ static int ubifs_vm_page_mkwrite(struc
        }
  
        wait_for_stable_page(page);
 -      unlock_page(page);
 -      return 0;
 +      return VM_FAULT_LOCKED;
  
  out_unlock:
        unlock_page(page);
@@@ -1582,15 -1577,15 +1577,15 @@@ const struct inode_operations ubifs_sym
  
  const struct file_operations ubifs_file_operations = {
        .llseek         = generic_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = generic_file_aio_read,
-       .aio_write      = ubifs_aio_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = generic_file_read_iter,
+       .write_iter     = ubifs_write_iter,
        .mmap           = ubifs_file_mmap,
        .fsync          = ubifs_fsync,
        .unlocked_ioctl = ubifs_ioctl,
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = ubifs_compat_ioctl,
  #endif
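
The .read/.write slots now point at the new_sync_read()/new_sync_write() shims, which adapt plain read(2)/write(2) to ->read_iter/->write_iter. Roughly what the write-side shim in this series does, reconstructed as a sketch (details approximate, not a verbatim copy):

static ssize_t new_sync_write_sketch(struct file *filp, const char __user *buf,
				     size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_nbytes = len;
	iov_iter_init(&iter, WRITE, &iov, 1, len);	/* one-segment iterator */

	ret = filp->f_op->write_iter(&kiocb, &iter);
	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}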
diff --combined fs/xfs/xfs_aops.c
index e32640eedea6430759310ab7b73ce909ab3bc445,08d13e3952524fdfef6fea570955e51eec3824e4..faaf716e2080ad5d41cd86dd05c1ac8f4e3e2fad
@@@ -975,39 -975,14 +975,39 @@@ xfs_vm_writepage
         * Given that we do not allow direct reclaim to call us, we should
         * never be called while in a filesystem transaction.
         */
 -      if (WARN_ON(current->flags & PF_FSTRANS))
 +      if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
                goto redirty;
  
        /* Is this page beyond the end of the file? */
        offset = i_size_read(inode);
        end_index = offset >> PAGE_CACHE_SHIFT;
        last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
 -      if (page->index >= end_index) {
 +
 +      /*
 +       * If the page index is less than end_index, adjust end_offset
 +       * to the highest offset that this page should represent.
 +       * -----------------------------------------------------
 +       * |                    file mapping           | <EOF> |
 +       * -----------------------------------------------------
 +       * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
 +       * ^--------------------------------^----------|--------
 +       * |     desired writeback range    |      see else    |
 +       * ---------------------------------^------------------|
 +       */
 +      if (page->index < end_index)
 +              end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
 +      else {
 +              /*
 +               * Check whether the page to write out is beyond or straddles
 +               * i_size.
 +               * -------------------------------------------------------
 +               * |            file mapping                    | <EOF>  |
 +               * -------------------------------------------------------
 +               * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
 +               * ^--------------------------------^-----------|---------
 +               * |                                |      Straddles     |
 +               * ---------------------------------^-----------|--------|
 +               */
                unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
  
                /*
                 * truncate operation that is in progress. We must redirty the
                 * page so that reclaim stops reclaiming it. Otherwise
                 * xfs_vm_releasepage() is called on it and gets confused.
 +               *
 +               * Note that end_index is an unsigned long: it would overflow
 +               * if the given offset were greater than 16TB on a 32-bit
 +               * system and we checked whether the page is fully outside
 +               * i_size via "if (page->index >= end_index + 1)", because
 +               * "end_index + 1" would evaluate to 0.  The page would then
 +               * be redirtied and written out repeatedly, resulting in an
 +               * infinite loop; the user program performing the operation
 +               * would hang.  Instead, we verify the situation by checking
 +               * whether the page to write is totally beyond i_size or
 +               * whether its offset equals the EOF.
                 */
 -              if (page->index >= end_index + 1 || offset_into_page == 0)
 +              if (page->index > end_index ||
 +                  (page->index == end_index && offset_into_page == 0))
                        goto redirty;
  
                /*
                 * The page straddles i_size.  It must be zeroed out on each
                 * and every writepage invocation because it may be mmapped.
                 * "A file is mapped in multiples of the page size.  For a file
 -               * that is not a multiple of the  page size, the remaining
 +               * that is not a multiple of the page size, the remaining
                 * memory is zeroed when mapped, and writes to that region are
                 * not written out to the file."
                 */
                zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
 +
 +              /* Adjust the end_offset to the end of file */
 +              end_offset = offset;
        }
  
 -      end_offset = min_t(unsigned long long,
 -                      (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
 -                      offset);
        len = 1 << inode->i_blkbits;
  
        bh = head = page_buffers(page);
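
The comment above can be checked with ordinary unsigned arithmetic. A self-contained demonstration (plain C, illustrative 32-bit values; with i_size in the last page below 16TB, a 32-bit end_index is 0xffffffff, so "end_index + 1" wraps to 0):

#include <assert.h>

int main(void)
{
	unsigned int end_index = 0xffffffffu;	/* i_size >> PAGE_SHIFT on 32-bit */
	unsigned int page_index = end_index;	/* the page straddling EOF */
	unsigned int offset_into_page = 123;	/* EOF is not page-aligned */

	/* old check: "+ 1" wraps to 0, so the straddling page is wrongly
	 * treated as fully beyond EOF and redirtied forever */
	assert(page_index >= end_index + 1 || offset_into_page == 0);

	/* new check from the hunk above: no "+ 1", no wrap; the page is
	 * correctly sent down the zeroing path instead */
	assert(!(page_index > end_index ||
		 (page_index == end_index && offset_into_page == 0)));
	return 0;
}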
@@@ -1225,9 -1188,9 +1225,9 @@@ xfs_vm_releasepage
  
        xfs_count_page_state(page, &delalloc, &unwritten);
  
 -      if (WARN_ON(delalloc))
 +      if (WARN_ON_ONCE(delalloc))
                return 0;
 -      if (WARN_ON(unwritten))
 +      if (WARN_ON_ONCE(unwritten))
                return 0;
  
        return try_to_free_buffers(page);
@@@ -1486,9 -1449,8 +1486,8 @@@ STATIC ssize_
  xfs_vm_direct_IO(
        int                     rw,
        struct kiocb            *iocb,
-       const struct iovec      *iov,
-       loff_t                  offset,
-       unsigned long           nr_segs)
+       struct iov_iter         *iter,
+       loff_t                  offset)
  {
        struct inode            *inode = iocb->ki_filp->f_mapping->host;
        struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
        ssize_t                 ret;
  
        if (rw & WRITE) {
-               size_t size = iov_length(iov, nr_segs);
+               size_t size = iov_iter_count(iter);
  
                /*
                 * We cannot preallocate a size update transaction here as we
                if (offset + size > XFS_I(inode)->i_d.di_size)
                        ioend->io_isdirect = 1;
  
-               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
-                                           offset, nr_segs,
-                                           xfs_get_blocks_direct,
+               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+                                           offset, xfs_get_blocks_direct,
                                            xfs_end_io_direct_write, NULL,
                                            DIO_ASYNC_EXTEND);
                if (ret != -EIOCBQUEUED && iocb->private)
                        goto out_destroy_ioend;
        } else {
-               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
-                                           offset, nr_segs,
-                                           xfs_get_blocks_direct,
+               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+                                           offset, xfs_get_blocks_direct,
                                            NULL, NULL, 0);
        }
  
diff --combined fs/xfs/xfs_file.c
index 1b8160dc04d120326de6bf39634073b9b7d7e98f,500c3f0656d0a27676955c7cfc757291fbee3d5d..1f66779d7a46628cf3a068dd5c08b36368fb6545
@@@ -229,34 -229,27 +229,27 @@@ xfs_file_fsync
  }
  
  STATIC ssize_t
- xfs_file_aio_read(
+ xfs_file_read_iter(
        struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos)
+       struct iov_iter         *to)
  {
        struct file             *file = iocb->ki_filp;
        struct inode            *inode = file->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
-       size_t                  size = 0;
+       size_t                  size = iov_iter_count(to);
        ssize_t                 ret = 0;
        int                     ioflags = 0;
        xfs_fsize_t             n;
+       loff_t                  pos = iocb->ki_pos;
  
        XFS_STATS_INC(xs_read_calls);
  
-       BUG_ON(iocb->ki_pos != pos);
        if (unlikely(file->f_flags & O_DIRECT))
                ioflags |= IO_ISDIRECT;
        if (file->f_mode & FMODE_NOCMTIME)
                ioflags |= IO_INVIS;
  
-       ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
-       if (ret < 0)
-               return ret;
        if (unlikely(ioflags & IO_ISDIRECT)) {
                xfs_buftarg_t   *target =
                        XFS_IS_REALTIME_INODE(ip) ?
  
        trace_xfs_file_read(ip, size, pos, ioflags);
  
-       ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
+       ret = generic_file_read_iter(iocb, to);
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
  
@@@ -349,47 -342,6 +342,6 @@@ xfs_file_splice_read
        return ret;
  }
  
- /*
-  * xfs_file_splice_write() does not use xfs_rw_ilock() because
-  * generic_file_splice_write() takes the i_mutex itself. This, in theory,
-  * could cause lock inversions between the aio_write path and the splice path
-  * if someone is doing concurrent splice(2) based writes and write(2) based
-  * writes to the same inode. The only real way to fix this is to re-implement
-  * the generic code here with correct locking orders.
-  */
- STATIC ssize_t
- xfs_file_splice_write(
-       struct pipe_inode_info  *pipe,
-       struct file             *outfilp,
-       loff_t                  *ppos,
-       size_t                  count,
-       unsigned int            flags)
- {
-       struct inode            *inode = outfilp->f_mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       int                     ioflags = 0;
-       ssize_t                 ret;
-       XFS_STATS_INC(xs_write_calls);
-       if (outfilp->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return -EIO;
-       xfs_ilock(ip, XFS_IOLOCK_EXCL);
-       trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
-       ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
-       if (ret > 0)
-               XFS_STATS_ADD(xs_write_bytes, ret);
-       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-       return ret;
- }
  /*
   * This routine is called to handle zeroing any space in the last block of the
   * file that is beyond the EOF.  We do this since the size is being increased
@@@ -625,10 -577,7 +577,7 @@@ restart
  STATIC ssize_t
  xfs_file_dio_aio_write(
        struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos,
-       size_t                  ocount)
+       struct iov_iter         *from)
  {
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 ret = 0;
-       size_t                  count = ocount;
        int                     unaligned_io = 0;
        int                     iolock;
+       size_t                  count = iov_iter_count(from);
+       loff_t                  pos = iocb->ki_pos;
        struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
  
        ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
        if (ret)
                goto out;
+       iov_iter_truncate(from, count);
  
        if (mapping->nrpages) {
                ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
        }
  
        trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-       ret = generic_file_direct_write(iocb, iovp,
-                       &nr_segs, pos, count, ocount);
+       ret = generic_file_direct_write(iocb, from, pos);
  
  out:
        xfs_rw_iunlock(ip, iolock);
  STATIC ssize_t
  xfs_file_buffered_aio_write(
        struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos,
-       size_t                  count)
+       struct iov_iter         *from)
  {
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        ssize_t                 ret;
        int                     enospc = 0;
        int                     iolock = XFS_IOLOCK_EXCL;
-       struct iov_iter         from;
+       loff_t                  pos = iocb->ki_pos;
+       size_t                  count = iov_iter_count(from);
  
        xfs_rw_ilock(ip, iolock);
  
        if (ret)
                goto out;
  
-       iov_iter_init(&from, iovp, nr_segs, count, 0);
+       iov_iter_truncate(from, count);
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
  
  write_retry:
        trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
-       ret = generic_perform_write(file, &from, pos);
+       ret = generic_perform_write(file, from, pos);
        if (likely(ret >= 0))
                iocb->ki_pos = pos + ret;
        /*
@@@ -759,40 -707,29 +707,29 @@@ out
  }
  
  STATIC ssize_t
- xfs_file_aio_write(
+ xfs_file_write_iter(
        struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos)
+       struct iov_iter         *from)
  {
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 ret;
-       size_t                  ocount = 0;
+       size_t                  ocount = iov_iter_count(from);
  
        XFS_STATS_INC(xs_write_calls);
  
-       BUG_ON(iocb->ki_pos != pos);
-       ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
-       if (ret)
-               return ret;
        if (ocount == 0)
                return 0;
  
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-               ret = -EIO;
-               goto out;
-       }
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return -EIO;
  
        if (unlikely(file->f_flags & O_DIRECT))
-               ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
+               ret = xfs_file_dio_aio_write(iocb, from);
        else
-               ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
-                                                 ocount);
+               ret = xfs_file_buffered_aio_write(iocb, from);
  
        if (ret > 0) {
                ssize_t err;
                if (err < 0)
                        ret = err;
        }
- out:
        return ret;
  }
  
@@@ -944,7 -879,7 +879,7 @@@ xfs_dir_open
         */
        mode = xfs_ilock_data_map_shared(ip);
        if (ip->i_d.di_nextents > 0)
 -              xfs_dir3_data_readahead(NULL, ip, 0, -1);
 +              xfs_dir3_data_readahead(ip, 0, -1);
        xfs_iunlock(ip, mode);
        return 0;
  }
@@@ -1461,12 -1396,12 +1396,12 @@@ xfs_file_llseek
  
  const struct file_operations xfs_file_operations = {
        .llseek         = xfs_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = xfs_file_aio_read,
-       .aio_write      = xfs_file_aio_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = xfs_file_read_iter,
+       .write_iter     = xfs_file_write_iter,
        .splice_read    = xfs_file_splice_read,
-       .splice_write   = xfs_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .unlocked_ioctl = xfs_file_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = xfs_file_compat_ioctl,
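
Seen from the filesystem side, a conversion in this series follows one template: implement only the iter entry points and let the shims and iter_file_splice_write() cover read(2)/write(2) and splice(2). A hedged sketch with a hypothetical "myfs":

static ssize_t myfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	return generic_file_read_iter(iocb, to);	/* page cache + O_DIRECT */
}

static ssize_t myfs_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return generic_file_write_iter(iocb, from);	/* takes i_mutex, syncs */
}

const struct file_operations myfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= new_sync_read,		/* shim over ->read_iter */
	.write		= new_sync_write,		/* shim over ->write_iter */
	.read_iter	= myfs_read_iter,
	.write_iter	= myfs_write_iter,
	.mmap		= generic_file_mmap,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,	/* splice via ->write_iter */
};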
diff --combined fs/xfs/xfs_trace.h
index 6910458915cfea9133cc3c39ff56e5f7f775c065,53182f97cf011e62e4806ce4bf3dbdfc71f00835..152f82782630222321bcd234b20c0ffb0a626e34
@@@ -538,64 -538,6 +538,64 @@@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_r
  DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
  DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
  
 +DECLARE_EVENT_CLASS(xfs_filestream_class,
 +      TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
 +      TP_ARGS(ip, agno),
 +      TP_STRUCT__entry(
 +              __field(dev_t, dev)
 +              __field(xfs_ino_t, ino)
 +              __field(xfs_agnumber_t, agno)
 +              __field(int, streams)
 +      ),
 +      TP_fast_assign(
 +              __entry->dev = VFS_I(ip)->i_sb->s_dev;
 +              __entry->ino = ip->i_ino;
 +              __entry->agno = agno;
 +              __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
 +      ),
 +      TP_printk("dev %d:%d ino 0x%llx agno %u streams %d",
 +                MAJOR(__entry->dev), MINOR(__entry->dev),
 +                __entry->ino,
 +                __entry->agno,
 +                __entry->streams)
 +)
 +#define DEFINE_FILESTREAM_EVENT(name) \
 +DEFINE_EVENT(xfs_filestream_class, name, \
 +      TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \
 +      TP_ARGS(ip, agno))
 +DEFINE_FILESTREAM_EVENT(xfs_filestream_free);
 +DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
 +DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
 +
 +TRACE_EVENT(xfs_filestream_pick,
 +      TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno,
 +               xfs_extlen_t free, int nscan),
 +      TP_ARGS(ip, agno, free, nscan),
 +      TP_STRUCT__entry(
 +              __field(dev_t, dev)
 +              __field(xfs_ino_t, ino)
 +              __field(xfs_agnumber_t, agno)
 +              __field(int, streams)
 +              __field(xfs_extlen_t, free)
 +              __field(int, nscan)
 +      ),
 +      TP_fast_assign(
 +              __entry->dev = VFS_I(ip)->i_sb->s_dev;
 +              __entry->ino = ip->i_ino;
 +              __entry->agno = agno;
 +              __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
 +              __entry->free = free;
 +              __entry->nscan = nscan;
 +      ),
 +      TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d",
 +                MAJOR(__entry->dev), MINOR(__entry->dev),
 +                __entry->ino,
 +                __entry->agno,
 +                __entry->streams,
 +                __entry->free,
 +                __entry->nscan)
 +);
 +
  DECLARE_EVENT_CLASS(xfs_lock_class,
        TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
                 unsigned long caller_ip),
@@@ -1118,7 -1060,6 +1118,6 @@@ DEFINE_RW_EVENT(xfs_file_read)
  DEFINE_RW_EVENT(xfs_file_buffered_write);
  DEFINE_RW_EVENT(xfs_file_direct_write);
  DEFINE_RW_EVENT(xfs_file_splice_read);
- DEFINE_RW_EVENT(xfs_file_splice_write);
  
  DECLARE_EVENT_CLASS(xfs_page_class,
        TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
index d8e4cea23a257c1b9c8b1514493c407b13d912e0,86df13b97160eb9b4bd8ede7e92158fd7f0825e4..66c2167f04a9d5788b68e97f446757bad082d780
@@@ -5,8 -5,6 +5,6 @@@
  #ifndef __LINUX_BLK_TYPES_H
  #define __LINUX_BLK_TYPES_H
  
- #ifdef CONFIG_BLOCK
  #include <linux/types.h>
  
  struct bio_set;
@@@ -28,6 -26,8 +26,8 @@@ struct bio_vec 
        unsigned int    bv_offset;
  };
  
+ #ifdef CONFIG_BLOCK
  struct bvec_iter {
        sector_t                bi_sector;      /* device address in 512 byte
                                                   sectors */
@@@ -190,7 -190,6 +190,7 @@@ enum rq_flag_bits 
        __REQ_PM,               /* runtime pm request */
        __REQ_END,              /* last of chain of requests */
        __REQ_HASHED,           /* on IO scheduler merge hash */
 +      __REQ_MQ_INFLIGHT,      /* track inflight for MQ */
        __REQ_NR_BITS,          /* stops here */
  };
  
  #define REQ_PM                        (1ULL << __REQ_PM)
  #define REQ_END                       (1ULL << __REQ_END)
  #define REQ_HASHED            (1ULL << __REQ_HASHED)
 +#define REQ_MQ_INFLIGHT               (1ULL << __REQ_MQ_INFLIGHT)
  
  #endif /* __LINUX_BLK_TYPES_H */
diff --combined include/linux/fs.h
index c3f46e499dd0027eed7f2bd0a0bc3cd465c3ac44,4e92d551518d89d61f763775f5416bfc520e7630..338e6f758c6d922be7d8163361da051efa0e3cbc
@@@ -128,6 -128,10 +128,10 @@@ typedef void (dio_iodone_t)(struct kioc
  #define FMODE_ATOMIC_POS      ((__force fmode_t)0x8000)
  /* Write access to underlying fs */
  #define FMODE_WRITER          ((__force fmode_t)0x10000)
+ /* Has read method(s) */
+ #define FMODE_CAN_READ          ((__force fmode_t)0x20000)
+ /* Has write method(s) */
+ #define FMODE_CAN_WRITE         ((__force fmode_t)0x40000)
  
  /* File was opened by fanotify and shouldn't generate fanotify events */
  #define FMODE_NONOTIFY                ((__force fmode_t)0x1000000)
@@@ -343,8 -347,7 +347,7 @@@ struct address_space_operations 
        void (*invalidatepage) (struct page *, unsigned int, unsigned int);
        int (*releasepage) (struct page *, gfp_t);
        void (*freepage)(struct page *);
-       ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
-                       loff_t offset, unsigned long nr_segs);
+       ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
        int (*get_xip_mem)(struct address_space *, pgoff_t, int,
                                                void **, unsigned long *);
        /*
@@@ -1448,6 -1451,8 +1451,8 @@@ struct block_device_operations
  #define HAVE_COMPAT_IOCTL 1
  #define HAVE_UNLOCKED_IOCTL 1
  
+ struct iov_iter;
  struct file_operations {
        struct module *owner;
        loff_t (*llseek) (struct file *, loff_t, int);
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
        ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+       ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
+       ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
        int (*iterate) (struct file *, struct dir_context *);
        unsigned int (*poll) (struct file *, struct poll_table_struct *);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
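
With three possible entry points per direction, the new FMODE_CAN_READ/FMODE_CAN_WRITE bits let the VFS decide readability and writability once at open time instead of probing f_op on every call. A sketch of the shape of that check (its exact placement in do_dentry_open() is an assumption, not quoted from the series):

static void sketch_set_fmode_caps(struct file *f)
{
	if (f->f_op->read || f->f_op->aio_read || f->f_op->read_iter)
		f->f_mode |= FMODE_CAN_READ;
	if (f->f_op->write || f->f_op->aio_write || f->f_op->write_iter)
		f->f_mode |= FMODE_CAN_WRITE;
}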
@@@ -2404,20 -2411,18 +2411,18 @@@ extern int generic_file_readonly_mmap(s
  extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
                unsigned long size, pgoff_t pgoff);
  int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
- extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
- extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long);
- extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
- extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
-               unsigned long *, loff_t, size_t, size_t);
+ extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
+ extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
+ extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
+ extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, loff_t);
  extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
  extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
  extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
- extern int generic_segment_checks(const struct iovec *iov,
-               unsigned long *nr_segs, size_t *count, int access_flags);
+ extern ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
+ extern ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
  
  /* fs/block_dev.c */
- extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos);
+ extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from);
  extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
                        int datasync);
  extern void block_sync_page(struct page *page);
@@@ -2427,7 -2432,7 +2432,7 @@@ extern ssize_t generic_file_splice_read
                struct pipe_inode_info *, size_t, unsigned int);
  extern ssize_t default_file_splice_read(struct file *, loff_t *,
                struct pipe_inode_info *, size_t, unsigned int);
- extern ssize_t generic_file_splice_write(struct pipe_inode_info *,
+ extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
                struct file *, loff_t *, size_t, unsigned int);
  extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
                struct file *out, loff_t *, size_t len, unsigned int flags);
@@@ -2477,16 -2482,16 +2482,16 @@@ enum 
  void dio_end_io(struct bio *bio, int error);
  
  ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
-       struct block_device *bdev, const struct iovec *iov, loff_t offset,
-       unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+       struct block_device *bdev, struct iov_iter *iter, loff_t offset,
+       get_block_t get_block, dio_iodone_t end_io,
        dio_submit_t submit_io, int flags);
  
  static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
-               struct inode *inode, const struct iovec *iov, loff_t offset,
-               unsigned long nr_segs, get_block_t get_block)
+               struct inode *inode, struct iov_iter *iter, loff_t offset,
+               get_block_t get_block)
  {
-       return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-                                   offset, nr_segs, get_block, NULL, NULL,
+       return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iter,
+                                   offset, get_block, NULL, NULL,
                                    DIO_LOCKING | DIO_SKIP_HOLES);
  }
  #endif
@@@ -2590,7 -2595,6 +2595,7 @@@ extern ssize_t simple_read_from_buffer(
  extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count);
  
 +extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
  extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
  
  extern int generic_check_addressable(unsigned, u64);
diff --combined include/linux/nfs_fs.h
index 919576b8e2cfd612d5a2b852f1aa9674811585d1,0a82b6fbae8a4de63683877fce383ae1dee4fcfa..e30f6059ecd642b44c0cc599344c0421b713958f
@@@ -459,13 -459,12 +459,12 @@@ extern int nfs3_removexattr (struct den
  /*
   * linux/fs/nfs/direct.c
   */
- extern ssize_t nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t,
-                       unsigned long);
+ extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t);
  extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
-                       const struct iovec *iov, unsigned long nr_segs,
+                       struct iov_iter *iter,
                        loff_t pos, bool uio);
  extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
-                       const struct iovec *iov, unsigned long nr_segs,
+                       struct iov_iter *iter,
                        loff_t pos, bool uio);
  
  /*
@@@ -520,6 -519,7 +519,6 @@@ extern int  nfs_writepage(struct page *
  extern int  nfs_writepages(struct address_space *, struct writeback_control *);
  extern int  nfs_flush_incompatible(struct file *file, struct page *page);
  extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
 -extern void nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
  
  /*
   * Try to write back everything synchronously (but check the
@@@ -552,6 -552,7 +551,6 @@@ nfs_have_writebacks(struct inode *inode
  extern int  nfs_readpage(struct file *, struct page *);
  extern int  nfs_readpages(struct file *, struct address_space *,
                struct list_head *, unsigned);
 -extern int  nfs_readpage_result(struct rpc_task *, struct nfs_read_data *);
  extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
                               struct page *);
  
diff --combined mm/filemap.c
index 7fadf1c6283844f07727a68fe12ce5f554f2fff6,7499ef19f1c15f4237b695c23d71414eecd97d3a..dafb06f70a09dd97b1fa690969a638f596714091
@@@ -742,7 -742,7 +742,7 @@@ void unlock_page(struct page *page
  {
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        clear_bit_unlock(PG_locked, &page->flags);
 -      smp_mb__after_clear_bit();
 +      smp_mb__after_atomic();
        wake_up_page(page, PG_locked);
  }
  EXPORT_SYMBOL(unlock_page);
   */
  void end_page_writeback(struct page *page)
  {
 -      if (TestClearPageReclaim(page))
 +      /*
 +       * TestClearPageReclaim could be used here but it is an atomic
 +       * operation and overkill in this particular case. Failing to
 +       * shuffle a page marked for immediate reclaim is too mild to
 +       * justify taking an atomic operation penalty at the end of
 +       * every page writeback.
 +       */
 +      if (PageReclaim(page)) {
 +              ClearPageReclaim(page);
                rotate_reclaimable_page(page);
 +      }
  
        if (!test_clear_page_writeback(page))
                BUG();
  
 -      smp_mb__after_clear_bit();
 +      smp_mb__after_atomic();
        wake_up_page(page, PG_writeback);
  }
  EXPORT_SYMBOL(end_page_writeback);
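
The trade-off described in the new comment, side by side: TestClearPageReclaim() is a locked read-modify-write on every writeback completion, while the split form costs a plain load in the common case (PageReclaim is rarely set) and only pays for the clear when needed. A sketch:

static inline void rotate_if_reclaim_atomic(struct page *page)
{
	if (TestClearPageReclaim(page))		/* atomic RMW, every time */
		rotate_reclaimable_page(page);
}

static inline void rotate_if_reclaim_cheap(struct page *page)
{
	if (PageReclaim(page)) {		/* plain test, usually false */
		ClearPageReclaim(page);		/* clear only in the rare case */
		rotate_reclaimable_page(page);
	}
}

The race this opens, a page marked for reclaim between the test and the clear missing one rotation to the LRU tail, is harmless, which is the comment's point.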
  
 +/*
 + * After completing I/O on a page, call this routine to update the page
 + * flags appropriately
 + */
 +void page_endio(struct page *page, int rw, int err)
 +{
 +      if (rw == READ) {
 +              if (!err) {
 +                      SetPageUptodate(page);
 +              } else {
 +                      ClearPageUptodate(page);
 +                      SetPageError(page);
 +              }
 +              unlock_page(page);
 +      } else { /* rw == WRITE */
 +              if (err) {
 +                      SetPageError(page);
 +                      if (page->mapping)
 +                              mapping_set_error(page->mapping, err);
 +              }
 +              end_page_writeback(page);
 +      }
 +}
 +EXPORT_SYMBOL_GPL(page_endio);
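
A typical (hypothetical) caller of the new helper: a bio completion handler that finishes page I/O for each segment, using the 3.16-era bi_end_io signature:

static void sketch_end_bio(struct bio *bio, int err)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i)
		page_endio(bvec->bv_page, bio_data_dir(bio), err);
	bio_put(bio);
}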
 +
  /**
   * __lock_page - get a lock on the page, assuming we need to sleep to get it
   * @page: the page to lock
@@@ -990,6 -956,26 +990,6 @@@ out
  }
  EXPORT_SYMBOL(find_get_entry);
  
 -/**
 - * find_get_page - find and get a page reference
 - * @mapping: the address_space to search
 - * @offset: the page index
 - *
 - * Looks up the page cache slot at @mapping & @offset.  If there is a
 - * page cache page, it is returned with an increased refcount.
 - *
 - * Otherwise, %NULL is returned.
 - */
 -struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
 -{
 -      struct page *page = find_get_entry(mapping, offset);
 -
 -      if (radix_tree_exceptional_entry(page))
 -              page = NULL;
 -      return page;
 -}
 -EXPORT_SYMBOL(find_get_page);
 -
  /**
   * find_lock_entry - locate, pin and lock a page cache entry
   * @mapping: the address_space to search
@@@ -1027,84 -1013,66 +1027,84 @@@ repeat
  EXPORT_SYMBOL(find_lock_entry);
  
  /**
 - * find_lock_page - locate, pin and lock a pagecache page
 + * pagecache_get_page - find and get a page reference
   * @mapping: the address_space to search
   * @offset: the page index
 + * @fgp_flags: FGP flags
 + * @gfp_mask: gfp mask to use if a page is to be allocated
   *
 - * Looks up the page cache slot at @mapping & @offset.  If there is a
 - * page cache page, it is returned locked and with an increased
 - * refcount.
 - *
 - * Otherwise, %NULL is returned.
 - *
 - * find_lock_page() may sleep.
 - */
 -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 -{
 -      struct page *page = find_lock_entry(mapping, offset);
 -
 -      if (radix_tree_exceptional_entry(page))
 -              page = NULL;
 -      return page;
 -}
 -EXPORT_SYMBOL(find_lock_page);
 -
 -/**
 - * find_or_create_page - locate or add a pagecache page
 - * @mapping: the page's address_space
 - * @index: the page's index into the mapping
 - * @gfp_mask: page allocation mode
 + * Looks up the page cache slot at @mapping & @offset.
   *
 - * Looks up the page cache slot at @mapping & @offset.  If there is a
 - * page cache page, it is returned locked and with an increased
 - * refcount.
 + * FGP flags modify how the page is returned
   *
 - * If the page is not present, a new page is allocated using @gfp_mask
 - * and added to the page cache and the VM's LRU list.  The page is
 - * returned locked and with an increased refcount.
 + * FGP_ACCESSED: the page will be marked accessed
 + * FGP_LOCK: the page is returned locked
 + * FGP_CREAT: if the page is not present, a new page is allocated using
 + *            @gfp_mask and added to the page cache and the VM's LRU
 + *            list. The page is returned locked and with an increased
 + *            refcount. Otherwise, %NULL is returned.
   *
 - * On memory exhaustion, %NULL is returned.
 + * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
 + * if the GFP flags specified for FGP_CREAT are atomic.
   *
 - * find_or_create_page() may sleep, even if @gfp_flags specifies an
 - * atomic allocation!
 + * If there is a page cache page, it is returned with an increased refcount.
   */
 -struct page *find_or_create_page(struct address_space *mapping,
 -              pgoff_t index, gfp_t gfp_mask)
 +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
 +      int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
  {
        struct page *page;
 -      int err;
 +
  repeat:
 -      page = find_lock_page(mapping, index);
 -      if (!page) {
 -              page = __page_cache_alloc(gfp_mask);
 +      page = find_get_entry(mapping, offset);
 +      if (radix_tree_exceptional_entry(page))
 +              page = NULL;
 +      if (!page)
 +              goto no_page;
 +
 +      if (fgp_flags & FGP_LOCK) {
 +              if (fgp_flags & FGP_NOWAIT) {
 +                      if (!trylock_page(page)) {
 +                              page_cache_release(page);
 +                              return NULL;
 +                      }
 +              } else {
 +                      lock_page(page);
 +              }
 +
 +              /* Has the page been truncated? */
 +              if (unlikely(page->mapping != mapping)) {
 +                      unlock_page(page);
 +                      page_cache_release(page);
 +                      goto repeat;
 +              }
 +              VM_BUG_ON_PAGE(page->index != offset, page);
 +      }
 +
 +      if (page && (fgp_flags & FGP_ACCESSED))
 +              mark_page_accessed(page);
 +
 +no_page:
 +      if (!page && (fgp_flags & FGP_CREAT)) {
 +              int err;
 +              if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
 +                      cache_gfp_mask |= __GFP_WRITE;
 +              if (fgp_flags & FGP_NOFS) {
 +                      cache_gfp_mask &= ~__GFP_FS;
 +                      radix_gfp_mask &= ~__GFP_FS;
 +              }
 +
 +              page = __page_cache_alloc(cache_gfp_mask);
                if (!page)
                        return NULL;
 -              /*
 -               * We want a regular kernel memory (not highmem or DMA etc)
 -               * allocation for the radix tree nodes, but we need to honour
 -               * the context-specific requirements the caller has asked for.
 -               * GFP_RECLAIM_MASK collects those requirements.
 -               */
 -              err = add_to_page_cache_lru(page, mapping, index,
 -                      (gfp_mask & GFP_RECLAIM_MASK));
 +
 +              if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
 +                      fgp_flags |= FGP_LOCK;
 +
 +              /* Init accessed so we avoid an atomic mark_page_accessed later */
 +              if (fgp_flags & FGP_ACCESSED)
 +                      init_page_accessed(page);
 +
 +              err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
                if (unlikely(err)) {
                        page_cache_release(page);
                        page = NULL;
                                goto repeat;
                }
        }
 +
        return page;
  }
 -EXPORT_SYMBOL(find_or_create_page);
 +EXPORT_SYMBOL(pagecache_get_page);
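
The helpers removed above (find_get_page, find_lock_page, find_or_create_page) survive as thin wrappers over pagecache_get_page(). Roughly the shape they take after this change (a sketch; flag and gfp choices are inferred from the removed bodies, not quoted):

static inline struct page *find_get_page_sketch(struct address_space *mapping,
						pgoff_t offset)
{
	return pagecache_get_page(mapping, offset, 0, 0, 0);
}

static inline struct page *
find_or_create_page_sketch(struct address_space *mapping, pgoff_t offset,
			   gfp_t gfp_mask)
{
	return pagecache_get_page(mapping, offset,
				  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
				  gfp_mask, gfp_mask & GFP_RECLAIM_MASK);
}

The second wrapper preserves the behaviour spelled out in the deleted comment: regular kernel memory for the radix-tree nodes, while honouring the caller's reclaim-related GFP bits.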
  
  /**
   * find_get_entries - gang pagecache lookup
@@@ -1412,6 -1379,39 +1412,6 @@@ repeat
  }
  EXPORT_SYMBOL(find_get_pages_tag);
  
 -/**
 - * grab_cache_page_nowait - returns locked page at given index in given cache
 - * @mapping: target address_space
 - * @index: the page index
 - *
 - * Same as grab_cache_page(), but do not wait if the page is unavailable.
 - * This is intended for speculative data generators, where the data can
 - * be regenerated if the page couldn't be grabbed.  This routine should
 - * be safe to call while holding the lock for another page.
 - *
 - * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 - * and deadlock against the caller's locked page.
 - */
 -struct page *
 -grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 -{
 -      struct page *page = find_get_page(mapping, index);
 -
 -      if (page) {
 -              if (trylock_page(page))
 -                      return page;
 -              page_cache_release(page);
 -              return NULL;
 -      }
 -      page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
 -      if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
 -              page_cache_release(page);
 -              page = NULL;
 -      }
 -      return page;
 -}
 -EXPORT_SYMBOL(grab_cache_page_nowait);
 -
  /*
   * CD/DVDs are error prone. When a medium error occurs, the driver may fail
   * a _large_ part of the i/o request. Imagine the worst scenario:
        return written ? written : error;
  }
  
- /*
-  * Performs necessary checks before doing a write
-  * @iov:      io vector request
-  * @nr_segs:  number of segments in the iovec
-  * @count:    number of bytes to write
-  * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
-  *
-  * Adjust number of segments and amount of bytes to write (nr_segs should be
-  * properly initialized first). Returns appropriate error code that caller
-  * should return or zero in case that write should be allowed.
-  */
- int generic_segment_checks(const struct iovec *iov,
-                       unsigned long *nr_segs, size_t *count, int access_flags)
- {
-       unsigned long   seg;
-       size_t cnt = 0;
-       for (seg = 0; seg < *nr_segs; seg++) {
-               const struct iovec *iv = &iov[seg];
-               /*
-                * If any segment has a negative length, or the cumulative
-                * length ever wraps negative then return -EINVAL.
-                */
-               cnt += iv->iov_len;
-               if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
-                       return -EINVAL;
-               if (access_ok(access_flags, iv->iov_base, iv->iov_len))
-                       continue;
-               if (seg == 0)
-                       return -EFAULT;
-               *nr_segs = seg;
-               cnt -= iv->iov_len;     /* This segment is no good */
-               break;
-       }
-       *count = cnt;
-       return 0;
- }
- EXPORT_SYMBOL(generic_segment_checks);
  /**
-  * generic_file_aio_read - generic filesystem read routine
+  * generic_file_read_iter - generic filesystem read routine
   * @iocb:     kernel I/O control block
-  * @iov:      io vector request
-  * @nr_segs:  number of segments in the iovec
-  * @pos:      current file position
+  * @iter:     destination for the data read
   *
-  * This is the "read()" routine for all filesystems
+  * This is the "read_iter()" routine for all filesystems
   * that can use the page cache directly.
   */
  ssize_t
- generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long nr_segs, loff_t pos)
+ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
  {
-       struct file *filp = iocb->ki_filp;
-       ssize_t retval;
-       size_t count;
+       struct file *file = iocb->ki_filp;
+       ssize_t retval = 0;
        loff_t *ppos = &iocb->ki_pos;
-       struct iov_iter i;
-       count = 0;
-       retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
-       if (retval)
-               return retval;
-       iov_iter_init(&i, iov, nr_segs, count, 0);
+       loff_t pos = *ppos;
  
        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
-       if (filp->f_flags & O_DIRECT) {
+       if (file->f_flags & O_DIRECT) {
+               struct address_space *mapping = file->f_mapping;
+               struct inode *inode = mapping->host;
+               size_t count = iov_iter_count(iter);
                loff_t size;
-               struct address_space *mapping;
-               struct inode *inode;
  
-               mapping = filp->f_mapping;
-               inode = mapping->host;
                if (!count)
                        goto out; /* skip atime */
                size = i_size_read(inode);
                retval = filemap_write_and_wait_range(mapping, pos,
-                                       pos + iov_length(iov, nr_segs) - 1);
+                                       pos + count - 1);
                if (!retval) {
-                       retval = mapping->a_ops->direct_IO(READ, iocb,
-                                                          iov, pos, nr_segs);
+                       struct iov_iter data = *iter;
+                       retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos);
                }
                if (retval > 0) {
                        *ppos = pos + retval;
-                       count -= retval;
-                       /*
-                        * If we did a short DIO read we need to skip the
-                        * section of the iov that we've already read data into.
-                        */
-                       iov_iter_advance(&i, retval);
+                       iov_iter_advance(iter, retval);
                }
  
                /*
                 * and return.  Otherwise fallthrough to buffered io for
                 * the rest of the read.
                 */
-               if (retval < 0 || !count || *ppos >= size) {
-                       file_accessed(filp);
+               if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {
+                       file_accessed(file);
                        goto out;
                }
        }
  
-       retval = do_generic_file_read(filp, ppos, &i, retval);
+       retval = do_generic_file_read(file, ppos, iter, retval);
  out:
        return retval;
  }
- EXPORT_SYMBOL(generic_file_aio_read);
+ EXPORT_SYMBOL(generic_file_read_iter);
  
  #ifdef CONFIG_MMU
  /**
@@@ -2381,14 -2327,13 +2327,12 @@@ int pagecache_write_end(struct file *fi
  {
        const struct address_space_operations *aops = mapping->a_ops;
  
 -      mark_page_accessed(page);
        return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
  }
  EXPORT_SYMBOL(pagecache_write_end);
  
  ssize_t
- generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long *nr_segs, loff_t pos,
-               size_t count, size_t ocount)
+ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
  {
        struct file     *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        ssize_t         written;
        size_t          write_len;
        pgoff_t         end;
+       struct iov_iter data;
  
-       if (count != ocount)
-               *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
-       write_len = iov_length(iov, *nr_segs);
+       write_len = iov_iter_count(from);
        end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
  
        written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
                }
        }
  
-       written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+       data = *from;
+       written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
  
        /*
         * Finally, try again to invalidate clean pages which might have been
  
        if (written > 0) {
                pos += written;
+               iov_iter_advance(from, written);
                if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
                        i_size_write(inode, pos);
                        mark_inode_dirty(inode);
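
Note the copy-then-advance idiom above: ->direct_IO is handed a copy of the iterator ("data = *from"), so a failed or short transfer leaves the caller's iov_iter untouched, and only the bytes actually written are committed back with iov_iter_advance(). In isolation (sketch):

static ssize_t sketch_dio_write(struct kiocb *iocb, struct iov_iter *from,
				loff_t pos)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct iov_iter data = *from;	/* ->direct_IO may consume this freely */
	ssize_t written;

	written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
	if (written > 0)
		iov_iter_advance(from, written);	/* commit only what was written */
	return written;
}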
@@@ -2462,18 -2407,34 +2406,18 @@@ EXPORT_SYMBOL(generic_file_direct_write
  struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                        pgoff_t index, unsigned flags)
  {
 -      int status;
 -      gfp_t gfp_mask;
        struct page *page;
 -      gfp_t gfp_notmask = 0;
 +      int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
  
 -      gfp_mask = mapping_gfp_mask(mapping);
 -      if (mapping_cap_account_dirty(mapping))
 -              gfp_mask |= __GFP_WRITE;
        if (flags & AOP_FLAG_NOFS)
 -              gfp_notmask = __GFP_FS;
 -repeat:
 -      page = find_lock_page(mapping, index);
 +              fgp_flags |= FGP_NOFS;
 +
 +      page = pagecache_get_page(mapping, index, fgp_flags,
 +                      mapping_gfp_mask(mapping),
 +                      GFP_KERNEL);
        if (page)
 -              goto found;
 +              wait_for_stable_page(page);
  
 -      page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
 -      if (!page)
 -              return NULL;
 -      status = add_to_page_cache_lru(page, mapping, index,
 -                                              GFP_KERNEL & ~gfp_notmask);
 -      if (unlikely(status)) {
 -              page_cache_release(page);
 -              if (status == -EEXIST)
 -                      goto repeat;
 -              return NULL;
 -      }
 -found:
 -      wait_for_stable_page(page);
        return page;
  }
  EXPORT_SYMBOL(grab_cache_page_write_begin);
@@@ -2522,7 -2483,7 +2466,7 @@@ again
  
                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                                &page, &fsdata);
 -              if (unlikely(status))
 +              if (unlikely(status < 0))
                        break;
  
                if (mapping_writably_mapped(mapping))
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
                flush_dcache_page(page);
  
 -              mark_page_accessed(page);
                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
                if (unlikely(status < 0))
  EXPORT_SYMBOL(generic_perform_write);
  
  /**
-  * __generic_file_aio_write - write data to a file
+  * __generic_file_write_iter - write data to a file
   * @iocb:     IO state structure (file, offset, etc.)
-  * @iov:      vector with data to write
-  * @nr_segs:  number of segments in the vector
+  * @from:     iov_iter with data to write
   *
   * This function does all the work needed for actually writing data to a
   * file. It does all basic checks, removes SUID from the file, updates
   * A caller has to handle it. This is mainly due to the fact that we want to
   * avoid syncing under i_mutex.
   */
- ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                unsigned long nr_segs)
+ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct address_space * mapping = file->f_mapping;
-       size_t ocount;          /* original count */
-       size_t count;           /* after file limit checks */
        struct inode    *inode = mapping->host;
        loff_t          pos = iocb->ki_pos;
        ssize_t         written = 0;
        ssize_t         err;
        ssize_t         status;
-       struct iov_iter from;
-       ocount = 0;
-       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-       if (err)
-               return err;
-       count = ocount;
+       size_t          count = iov_iter_count(from);
  
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
        if (count == 0)
                goto out;
  
+       iov_iter_truncate(from, count);
        err = file_remove_suid(file);
        if (err)
                goto out;
        if (err)
                goto out;
  
-       iov_iter_init(&from, iov, nr_segs, count, 0);
        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (unlikely(file->f_flags & O_DIRECT)) {
                loff_t endbyte;
  
-               written = generic_file_direct_write(iocb, iov, &from.nr_segs, pos,
-                                                       count, ocount);
+               written = generic_file_direct_write(iocb, from, pos);
                if (written < 0 || written == count)
                        goto out;
-               iov_iter_advance(&from, written);
  
                /*
                 * direct-io write to a hole: fall through to buffered I/O
                pos += written;
                count -= written;
  
-               status = generic_perform_write(file, &from, pos);
+               status = generic_perform_write(file, from, pos);
                /*
                 * If generic_perform_write() returned a synchronous error
                 * then we want to return the number of bytes which were
                         */
                }
        } else {
-               written = generic_perform_write(file, &from, pos);
+               written = generic_perform_write(file, from, pos);
                if (likely(written >= 0))
                        iocb->ki_pos = pos + written;
        }
        current->backing_dev_info = NULL;
        return written ? written : err;
  }
- EXPORT_SYMBOL(__generic_file_aio_write);
+ EXPORT_SYMBOL(__generic_file_write_iter);
  
  /**
-  * generic_file_aio_write - write data to a file
+  * generic_file_write_iter - write data to a file
   * @iocb:     IO state structure
-  * @iov:      vector with data to write
-  * @nr_segs:  number of segments in the vector
-  * @pos:      position in file where to write
+  * @from:     iov_iter with data to write
   *
-  * This is a wrapper around __generic_file_aio_write() to be used by most
+  * This is a wrapper around __generic_file_write_iter() to be used by most
   * filesystems. It takes care of syncing the file in case of O_SYNC file
   * and acquires i_mutex as needed.
   */
- ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long nr_segs, loff_t pos)
+ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;
  
-       BUG_ON(iocb->ki_pos != pos);
        mutex_lock(&inode->i_mutex);
-       ret = __generic_file_aio_write(iocb, iov, nr_segs);
+       ret = __generic_file_write_iter(iocb, from);
        mutex_unlock(&inode->i_mutex);
  
        if (ret > 0) {
        }
        return ret;
  }
- EXPORT_SYMBOL(generic_file_aio_write);
+ EXPORT_SYMBOL(generic_file_write_iter);
  
  /**
   * try_to_release_page() - release old fs-specific metadata on a page
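
The wrapper pattern above is what converted filesystems now follow: take i_mutex around __generic_file_write_iter(), then sync outside the lock. Spelled out in full (the same shape as generic_file_write_iter() above; "myfs" is a placeholder name, not a filesystem touched by this series):

static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	ssize_t ret;

	/* write under i_mutex... */
	mutex_lock(&inode->i_mutex);
	ret = __generic_file_write_iter(iocb, from);
	mutex_unlock(&inode->i_mutex);

	/* ...but sync (for O_SYNC/O_DSYNC) after dropping it */
	if (ret > 0) {
		ssize_t err;

		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
		if (err < 0)
			ret = err;
	}
	return ret;
}
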
diff --combined mm/page_io.c
index 58b50d2901fe2a43a916bae15daffb5c8c6f1cbe,33bb38c4aad716b326c299d63a6cb72b74759bbc..243a9b76e5cee9257d499311c21153084980c39a
@@@ -248,25 -248,28 +248,33 @@@ out
        return ret;
  }
  
 +static sector_t swap_page_sector(struct page *page)
 +{
 +      return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9);
 +}
 +
  int __swap_writepage(struct page *page, struct writeback_control *wbc,
        void (*end_write_func)(struct bio *, int))
  {
        struct bio *bio;
 -      int ret = 0, rw = WRITE;
 +      int ret, rw = WRITE;
        struct swap_info_struct *sis = page_swap_info(page);
  
        if (sis->flags & SWP_FILE) {
                struct kiocb kiocb;
                struct file *swap_file = sis->swap_file;
                struct address_space *mapping = swap_file->f_mapping;
-               struct iovec iov = {
-                       .iov_base = kmap(page),
-                       .iov_len  = PAGE_SIZE,
+               struct bio_vec bv = {
+                       .bv_page = page,
+                       .bv_len  = PAGE_SIZE,
+                       .bv_offset = 0
+               };
+               struct iov_iter from = {
+                       .type = ITER_BVEC | WRITE,
+                       .count = PAGE_SIZE,
+                       .iov_offset = 0,
+                       .nr_segs = 1,
+                       .bvec = &bv
                };
  
                init_sync_kiocb(&kiocb, swap_file);
  
                set_page_writeback(page);
                unlock_page(page);
-               ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
-                                               &kiocb, &iov,
-                                               kiocb.ki_pos, 1);
-               kunmap(page);
+               ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE,
+                                               &kiocb, &from,
+                                               kiocb.ki_pos);
                if (ret == PAGE_SIZE) {
                        count_vm_event(PSWPOUT);
                        ret = 0;
                return ret;
        }
  
 +      ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
 +      if (!ret) {
 +              count_vm_event(PSWPOUT);
 +              return 0;
 +      }
 +
 +      ret = 0;
        bio = get_swap_bio(GFP_NOIO, page, end_write_func);
        if (bio == NULL) {
                set_page_dirty(page);
@@@ -350,13 -345,6 +357,13 @@@ int swap_readpage(struct page *page
                return ret;
        }
  
 +      ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
 +      if (!ret) {
 +              count_vm_event(PSWPIN);
 +              return 0;
 +      }
 +
 +      ret = 0;
        bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
        if (bio == NULL) {
                unlock_page(page);
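
Note that __swap_writepage() above open-codes the single-page ITER_BVEC iterator because the series adds no bvec init helper yet. For illustration only, the same construction on the read side would look as below; swap_readpage() itself keeps using bdev_read_page()/bios, and swap_page_direct_read() is a hypothetical name, not part of the patch:

static int swap_page_direct_read(struct swap_info_struct *sis,
				 struct page *page)
{
	struct file *swap_file = sis->swap_file;
	struct address_space *mapping = swap_file->f_mapping;
	struct kiocb kiocb;
	struct bio_vec bv = {
		.bv_page = page,
		.bv_len  = PAGE_SIZE,
		.bv_offset = 0
	};
	struct iov_iter to = {
		.type = ITER_BVEC | READ,
		.count = PAGE_SIZE,
		.iov_offset = 0,
		.nr_segs = 1,
		.bvec = &bv
	};
	int ret;

	init_sync_kiocb(&kiocb, swap_file);
	kiocb.ki_pos = page_file_offset(page);
	/* rw argument mirrors the ITER_BVEC | WRITE call above */
	ret = mapping->a_ops->direct_IO(ITER_BVEC | READ, &kiocb, &to,
					kiocb.ki_pos);
	/* success is a full-page transfer, as in the write path */
	return ret == PAGE_SIZE ? 0 : ret;
}
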
diff --combined mm/shmem.c
index 5402481c28d190a83718f6b1897eec78df4c0209,de834ab8b6b90a0a406da4c96278ef946702e7ad..f484c276e994923a5c05577b42d5a9dcc58ae7cc
@@@ -1132,7 -1132,7 +1132,7 @@@ repeat
                        goto decused;
                }
  
 -              SetPageSwapBacked(page);
 +              __SetPageSwapBacked(page);
                __set_page_locked(page);
                error = mem_cgroup_charge_file(page, current->mm,
                                                gfp & GFP_RECLAIM_MASK);
@@@ -1372,13 -1372,9 +1372,13 @@@ shmem_write_begin(struct file *file, st
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
  {
 +      int ret;
        struct inode *inode = mapping->host;
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 -      return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
 +      ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
 +      if (ret == 0 && *pagep)
 +              init_page_accessed(*pagep);
 +      return ret;
  }
  
  static int
@@@ -1406,8 -1402,7 +1406,7 @@@ shmem_write_end(struct file *file, stru
        return copied;
  }
  
- static ssize_t shmem_file_aio_read(struct kiocb *iocb,
-               const struct iovec *iov, unsigned long nr_segs, loff_t pos)
+ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        unsigned long offset;
        enum sgp_type sgp = SGP_READ;
        int error = 0;
-       ssize_t retval;
-       size_t count;
+       ssize_t retval = 0;
        loff_t *ppos = &iocb->ki_pos;
-       struct iov_iter iter;
-       retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
-       if (retval)
-               return retval;
-       iov_iter_init(&iter, iov, nr_segs, count, 0);
  
        /*
         * Might this read be for a stacking filesystem?  Then when reading
                 * Ok, we have the page, and it's up-to-date, so
                 * now we can copy it to user space...
                 */
-               ret = copy_page_to_iter(page, offset, nr, &iter);
+               ret = copy_page_to_iter(page, offset, nr, to);
                retval += ret;
                offset += ret;
                index += offset >> PAGE_CACHE_SHIFT;
                offset &= ~PAGE_CACHE_MASK;
  
                page_cache_release(page);
-               if (!iov_iter_count(&iter))
+               if (!iov_iter_count(to))
                        break;
                if (ret < nr) {
                        error = -EFAULT;
@@@ -2629,13 -2617,13 +2621,13 @@@ static const struct file_operations shm
        .mmap           = shmem_mmap,
  #ifdef CONFIG_TMPFS
        .llseek         = shmem_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = shmem_file_aio_read,
-       .aio_write      = generic_file_aio_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = shmem_file_read_iter,
+       .write_iter     = generic_file_write_iter,
        .fsync          = noop_fsync,
        .splice_read    = shmem_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .fallocate      = shmem_fallocate,
  #endif
  };
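
The ->read/->write slots above now point at the new_sync_read()/new_sync_write() glue from this series, which wraps the user buffer in a one-segment iov_iter and dispatches to ->read_iter/->write_iter. Roughly, on the read side (a simplified sketch of the fs/read_write.c helper, not verbatim):

static ssize_t new_sync_read_sketch(struct file *filp, char __user *buf,
				    size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	iov_iter_init(&iter, READ, &iov, 1, len);

	/* for shmem this reaches shmem_file_read_iter() */
	ret = filp->f_op->read_iter(&kiocb, &iter);
	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
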
diff --combined mm/vmscan.c
index e01ded365440704dbec95f0ec8f56326d646b9c2,9c2dba6ac68541397b5f3c9658c1a898284833cf..0f16ffe8eb67c6fcd0350add4a5a4b6092cb6905
@@@ -11,8 -11,6 +11,8 @@@
   *  Multiqueue VM started 5.8.00, Rik van Riel.
   */
  
 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 +
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/gfp.h>
@@@ -45,7 -43,6 +45,7 @@@
  #include <linux/sysctl.h>
  #include <linux/oom.h>
  #include <linux/prefetch.h>
 +#include <linux/printk.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
@@@ -86,9 -83,6 +86,9 @@@ struct scan_control 
        /* Scan (total_size >> priority) pages at once */
        int priority;
  
 +      /* anon vs. file LRUs scanning "ratio" */
 +      int swappiness;
 +
        /*
         * The memory cgroup that hit its limit and as a result is the
         * primary target of this reclaim invocation.
@@@ -330,7 -324,7 +330,7 @@@ shrink_slab_node(struct shrink_control 
        else
                new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
  
 -      trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
 +      trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
        return freed;
  }
  
@@@ -464,7 -458,7 +464,7 @@@ static pageout_t pageout(struct page *p
         * stalls if we need to run get_block().  We could test
         * PagePrivate for that.
         *
-        * If this process is currently in __generic_file_aio_write() against
+        * If this process is currently in __generic_file_write_iter() against
         * this page's queue, we can perform writeback even if that
         * will block.
         *
                if (page_has_private(page)) {
                        if (try_to_free_buffers(page)) {
                                ClearPageDirty(page);
 -                              printk("%s: orphaned page\n", __func__);
 +                              pr_info("%s: orphaned page\n", __func__);
                                return PAGE_CLEAN;
                        }
                }
@@@ -1127,7 -1121,7 +1127,7 @@@ keep
                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }
  
 -      free_hot_cold_page_list(&free_pages, 1);
 +      free_hot_cold_page_list(&free_pages, true);
  
        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);
@@@ -1444,19 -1438,6 +1444,19 @@@ putback_inactive_pages(struct lruvec *l
        list_splice(&pages_to_free, page_list);
  }
  
 +/*
 + * If a kernel thread (such as nfsd for loop-back mounts) services
 + * a backing device by writing to the page cache, it sets PF_LESS_THROTTLE.
 + * In that case we should only throttle if the backing device it is
 + * writing to is congested.  In other cases it is safe to throttle.
 + */
 +static int current_may_throttle(void)
 +{
 +      return !(current->flags & PF_LESS_THROTTLE) ||
 +              current->backing_dev_info == NULL ||
 +              bdi_write_congested(current->backing_dev_info);
 +}
 +
  /*
   * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
   * of reclaimed pages
@@@ -1538,7 -1519,7 +1538,7 @@@ shrink_inactive_list(unsigned long nr_t
  
        spin_unlock_irq(&zone->lru_lock);
  
 -      free_hot_cold_page_list(&page_list, 1);
 +      free_hot_cold_page_list(&page_list, true);
  
        /*
         * If reclaim is isolating dirty pages under writeback, it implies
                 * If dirty pages are scanned that are not queued for IO, it
                 * implies that flushers are not keeping up. In this case, flag
                 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
 -               * pages from reclaim context. It will forcibly stall in the
 -               * next check.
 +               * pages from reclaim context.
                 */
                if (nr_unqueued_dirty == nr_taken)
                        zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
  
                /*
 -               * In addition, if kswapd scans pages marked marked for
 -               * immediate reclaim and under writeback (nr_immediate), it
 -               * implies that pages are cycling through the LRU faster than
 +               * If kswapd scans pages marked for immediate
 +               * reclaim and under writeback (nr_immediate), it implies
 +               * that pages are cycling through the LRU faster than
                 * they are written so also forcibly stall.
                 */
 -              if (nr_unqueued_dirty == nr_taken || nr_immediate)
 +              if (nr_immediate && current_may_throttle())
                        congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
  
         * is congested. Allow kswapd to continue until it starts encountering
         * unqueued dirty pages or cycling through the LRU too quickly.
         */
 -      if (!sc->hibernation_mode && !current_is_kswapd())
 +      if (!sc->hibernation_mode && !current_is_kswapd() &&
 +          current_may_throttle())
                wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
  
        trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
@@@ -1759,7 -1740,7 +1759,7 @@@ static void shrink_active_list(unsigne
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
        spin_unlock_irq(&zone->lru_lock);
  
 -      free_hot_cold_page_list(&l_hold, 1);
 +      free_hot_cold_page_list(&l_hold, true);
  }
  
  #ifdef CONFIG_SWAP
@@@ -1849,6 -1830,13 +1849,6 @@@ static unsigned long shrink_list(enum l
        return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
  }
  
 -static int vmscan_swappiness(struct scan_control *sc)
 -{
 -      if (global_reclaim(sc))
 -              return vm_swappiness;
 -      return mem_cgroup_swappiness(sc->target_mem_cgroup);
 -}
 -
  enum scan_balance {
        SCAN_EQUAL,
        SCAN_FRACT,
@@@ -1878,8 -1866,6 +1878,8 @@@ static void get_scan_count(struct lruve
        bool force_scan = false;
        unsigned long ap, fp;
        enum lru_list lru;
 +      bool some_scanned;
 +      int pass;
  
        /*
         * If the zone or memcg is small, nr[l] can be 0.  This
         * using the memory controller's swap limit feature would be
         * too expensive.
         */
 -      if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
 +      if (!global_reclaim(sc) && !sc->swappiness) {
                scan_balance = SCAN_FILE;
                goto out;
        }
         * system is close to OOM, scan both anon and file equally
         * (unless the swappiness setting disagrees with swapping).
         */
 -      if (!sc->priority && vmscan_swappiness(sc)) {
 +      if (!sc->priority && sc->swappiness) {
                scan_balance = SCAN_EQUAL;
                goto out;
        }
         * With swappiness at 100, anonymous and file have the same priority.
         * This scanning priority is essentially the inverse of IO cost.
         */
 -      anon_prio = vmscan_swappiness(sc);
 +      anon_prio = sc->swappiness;
        file_prio = 200 - anon_prio;
  
        /*
        fraction[1] = fp;
        denominator = ap + fp + 1;
  out:
 -      for_each_evictable_lru(lru) {
 -              int file = is_file_lru(lru);
 -              unsigned long size;
 -              unsigned long scan;
 +      some_scanned = false;
 +      /* Only use force_scan on second pass. */
 +      for (pass = 0; !some_scanned && pass < 2; pass++) {
 +              for_each_evictable_lru(lru) {
 +                      int file = is_file_lru(lru);
 +                      unsigned long size;
 +                      unsigned long scan;
  
 -              size = get_lru_size(lruvec, lru);
 -              scan = size >> sc->priority;
 +                      size = get_lru_size(lruvec, lru);
 +                      scan = size >> sc->priority;
  
 -              if (!scan && force_scan)
 -                      scan = min(size, SWAP_CLUSTER_MAX);
 +                      if (!scan && pass && force_scan)
 +                              scan = min(size, SWAP_CLUSTER_MAX);
  
 -              switch (scan_balance) {
 -              case SCAN_EQUAL:
 -                      /* Scan lists relative to size */
 -                      break;
 -              case SCAN_FRACT:
 +                      switch (scan_balance) {
 +                      case SCAN_EQUAL:
 +                              /* Scan lists relative to size */
 +                              break;
 +                      case SCAN_FRACT:
 +                              /*
 +                               * Scan types proportional to swappiness and
 +                               * their relative recent reclaim efficiency.
 +                               */
 +                              scan = div64_u64(scan * fraction[file],
 +                                                      denominator);
 +                              break;
 +                      case SCAN_FILE:
 +                      case SCAN_ANON:
 +                              /* Scan one type exclusively */
 +                              if ((scan_balance == SCAN_FILE) != file)
 +                                      scan = 0;
 +                              break;
 +                      default:
 +                              /* Look ma, no brain */
 +                              BUG();
 +                      }
 +                      nr[lru] = scan;
                        /*
 -                       * Scan types proportional to swappiness and
 -                       * their relative recent reclaim efficiency.
 +                       * Skip the second pass and don't force_scan
 +                       * if we found something to scan.
                         */
 -                      scan = div64_u64(scan * fraction[file], denominator);
 -                      break;
 -              case SCAN_FILE:
 -              case SCAN_ANON:
 -                      /* Scan one type exclusively */
 -                      if ((scan_balance == SCAN_FILE) != file)
 -                              scan = 0;
 -                      break;
 -              default:
 -                      /* Look ma, no brain */
 -                      BUG();
 +                      some_scanned |= !!scan;
                }
 -              nr[lru] = scan;
        }
  }
  
@@@ -2061,27 -2037,13 +2061,27 @@@ static void shrink_lruvec(struct lruve
        unsigned long nr_reclaimed = 0;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
        struct blk_plug plug;
 -      bool scan_adjusted = false;
 +      bool scan_adjusted;
  
        get_scan_count(lruvec, sc, nr);
  
        /* Record the original scan target for proportional adjustments later */
        memcpy(targets, nr, sizeof(nr));
  
 +      /*
 +       * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
 +       * event that can occur when there is little memory pressure, e.g.
 +       * multiple streaming readers/writers. Hence, when scanning at
 +       * DEF_PRIORITY we do not abort once the requested number of pages
 +       * has been reclaimed, on the assumption that direct reclaiming
 +       * implies that kswapd is not keeping up and it is best to
 +       * do a batch of work at once. For memcg reclaim one check is made to
 +       * abort proportional reclaim if either the file or anon lru has already
 +       * dropped to zero at the first pass.
 +       */
 +      scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
 +                       sc->priority == DEF_PRIORITY);
 +
        blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
                if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
                        continue;
  
 -              /*
 -               * For global direct reclaim, reclaim only the number of pages
 -               * requested. Less care is taken to scan proportionally as it
 -               * is more important to minimise direct reclaim stall latency
 -               * than it is to properly age the LRU lists.
 -               */
 -              if (global_reclaim(sc) && !current_is_kswapd())
 -                      break;
 -
                /*
                 * For kswapd and memcg, reclaim at least the number of pages
 -               * requested. Ensure that the anon and file LRUs shrink
 +               * requested. Ensure that the anon and file LRUs are scanned
                 * proportionally to what was requested by get_scan_count(). We
                 * stop reclaiming one LRU and reduce the amount of scanning
                 * proportional to the original scan target.
                nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
                nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
  
 +              /*
 +               * It's just vindictive to attack the larger once the smaller
 +               * has gone to zero.  And given the way we stop scanning the
 +               * smaller below, this makes sure that we only make one nudge
 +               * towards proportionality once we've got nr_to_reclaim.
 +               */
 +              if (!nr_file || !nr_anon)
 +                      break;
 +
                if (nr_file > nr_anon) {
                        unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
                                                targets[LRU_ACTIVE_ANON] + 1;
@@@ -2262,7 -2224,6 +2262,7 @@@ static void shrink_zone(struct zone *zo
  
                        lruvec = mem_cgroup_zone_lruvec(zone, memcg);
  
 +                      sc->swappiness = mem_cgroup_swappiness(memcg);
                        shrink_lruvec(lruvec, sc);
  
                        /*
@@@ -2307,8 -2268,9 +2307,8 @@@ static inline bool compaction_ready(str
         * there is a buffer of free pages available to give compaction
         * a reasonable chance of completing and allocating the page
         */
 -      balance_gap = min(low_wmark_pages(zone),
 -              (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
 -                      KSWAPD_ZONE_BALANCE_GAP_RATIO);
 +      balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
 +                      zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
        watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
        watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
  
@@@ -2563,17 -2525,10 +2563,17 @@@ static bool pfmemalloc_watermark_ok(pg_
  
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
 +              if (!populated_zone(zone))
 +                      continue;
 +
                pfmemalloc_reserve += min_wmark_pages(zone);
                free_pages += zone_page_state(zone, NR_FREE_PAGES);
        }
  
 +      /* If there are no reserves (unexpected config) then do not throttle */
 +      if (!pfmemalloc_reserve)
 +              return true;
 +
        wmark_ok = free_pages > pfmemalloc_reserve / 2;
  
        /* kswapd must be awake if processes are being throttled */
  static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
                                        nodemask_t *nodemask)
  {
 +      struct zoneref *z;
        struct zone *zone;
 -      int high_zoneidx = gfp_zone(gfp_mask);
 -      pg_data_t *pgdat;
 +      pg_data_t *pgdat = NULL;
  
        /*
         * Kernel threads should not be throttled as they may be indirectly
        if (fatal_signal_pending(current))
                goto out;
  
 -      /* Check if the pfmemalloc reserves are ok */
 -      first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
 -      pgdat = zone->zone_pgdat;
 -      if (pfmemalloc_watermark_ok(pgdat))
 +      /*
 +       * Check if the pfmemalloc reserves are ok by finding the first node
 +       * with a usable ZONE_NORMAL or lower zone. The expectation is that
 +       * GFP_KERNEL will be required for allocating network buffers when
 +       * swapping over the network so ZONE_HIGHMEM is unusable.
 +       *
 +       * Throttling is based on the first usable node and throttled processes
 +       * wait on a queue until kswapd makes progress and wakes them. There
 +       * is an affinity then between processes waking up and where reclaim
 +       * progress has been made, assuming the process wakes on the same node.
 +       * More importantly, processes running on remote nodes will not compete
 +       * for remote pfmemalloc reserves and processes on different nodes
 +       * should make reasonable progress.
 +       */
 +      for_each_zone_zonelist_nodemask(zone, z, zonelist,
 +                                      gfp_mask, nodemask) {
 +              if (zone_idx(zone) > ZONE_NORMAL)
 +                      continue;
 +
 +              /* Throttle based on the first usable node */
 +              pgdat = zone->zone_pgdat;
 +              if (pfmemalloc_watermark_ok(pgdat))
 +                      goto out;
 +              break;
 +      }
 +
 +      /* If no zone was usable by the allocation flags then do not throttle */
 +      if (!pgdat)
                goto out;
  
        /* Account for the throttling */
@@@ -2729,7 -2660,6 +2729,7 @@@ unsigned long mem_cgroup_shrink_node_zo
                .may_swap = !noswap,
                .order = 0,
                .priority = 0,
 +              .swappiness = mem_cgroup_swappiness(memcg),
                .target_mem_cgroup = memcg,
        };
        struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
@@@ -2961,8 -2891,9 +2961,8 @@@ static bool kswapd_shrink_zone(struct z
         * high wmark plus a "gap" where the gap is either the low
         * watermark or 1% of the zone, whichever is smaller.
         */
 -      balance_gap = min(low_wmark_pages(zone),
 -              (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
 -              KSWAPD_ZONE_BALANCE_GAP_RATIO);
 +      balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
 +                      zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
  
        /*
         * If there is no low memory pressure or the zone is balanced then no
@@@ -3371,10 -3302,7 +3371,10 @@@ static int kswapd(void *p
                }
        }
  
 +      tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
        current->reclaim_state = NULL;
 +      lockdep_clear_current_reclaim_state();
 +
        return 0;
  }
  
@@@ -3494,7 -3422,7 +3494,7 @@@ int kswapd_run(int nid
  
  /*
   * Called by memory hotplug when all memory in a node is offlined.  Caller must
 - * hold lock_memory_hotplug().
 + * hold mem_hotplug_begin/end().
   */
  void kswapd_stop(int nid)
  {
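
One closing note on the two balance_gap hunks above: DIV_ROUND_UP() from <linux/kernel.h> is the same round-up idiom the old code spelled out by hand, so the change is purely cosmetic:

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* hence the new form */
balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
		zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
/* expands to the old one:
 * min(low_wmark_pages(zone),
 *     (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
 *      KSWAPD_ZONE_BALANCE_GAP_RATIO)
 */
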