Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Jun 2014 17:30:18 +0000 (10:30 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Jun 2014 17:30:18 +0000 (10:30 -0700)
Pull vfs updates from Al Viro:
 "This the bunch that sat in -next + lock_parent() fix.  This is the
  minimal set; there's more pending stuff.

  In particular, I really hope to get acct.c fixes merged this cycle -
  we need that to deal sanely with delayed-mntput stuff.  In the next
  pile, hopefully - that series is fairly short and localized
  (kernel/acct.c, fs/super.c and fs/namespace.c).  In this pile: more
  iov_iter work.  Most of the prereqs for ->splice_write with sane locking
  order are there and Kent's dio rewrite would also fit nicely on top of
  this pile"

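The recurring conversion in this pile moves filesystems from the
iovec-array methods (->aio_read/->aio_write plus hand-rolled
->read/->write wrappers) to the iov_iter-based ->read_iter/->write_iter,
with generic helpers supplying the synchronous and splice paths. A
minimal sketch of the resulting wiring for a hypothetical filesystem
"foo" (the generic helpers named here are the ones this series
introduces and uses in the lustre conversion below):

	/* sketch only: "foo" is hypothetical */
	static ssize_t foo_read_iter(struct kiocb *iocb, struct iov_iter *to)
	{
		return generic_file_read_iter(iocb, to);
	}

	static ssize_t foo_write_iter(struct kiocb *iocb, struct iov_iter *from)
	{
		return generic_file_write_iter(iocb, from);
	}

	const struct file_operations foo_file_operations = {
		.read		= new_sync_read,	/* read(2) bridges to ->read_iter */
		.write		= new_sync_write,	/* write(2) bridges to ->write_iter */
		.read_iter	= foo_read_iter,
		.write_iter	= foo_write_iter,
		.splice_read	= generic_file_splice_read,
		.splice_write	= iter_file_splice_write, /* replaces generic_file_splice_write() */
	};
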
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (70 commits)
  lock_parent: don't step on stale ->d_parent of all-but-freed one
  kill generic_file_splice_write()
  ceph: switch to iter_file_splice_write()
  shmem: switch to iter_file_splice_write()
  nfs: switch to iter_file_splice_write()
  fs/splice.c: remove unneeded exports
  ocfs2: switch to iter_file_splice_write()
  ->splice_write() via ->write_iter()
  bio_vec-backed iov_iter
  optimize copy_page_{to,from}_iter()
  bury generic_file_aio_{read,write}
  lustre: get rid of messing with iovecs
  ceph: switch to ->write_iter()
  ceph_sync_direct_write: stop poking into iov_iter guts
  ceph_sync_read: stop poking into iov_iter guts
  new helper: copy_page_from_iter()
  fuse: switch to ->write_iter()
  btrfs: switch to ->write_iter()
  ocfs2: switch to ->write_iter()
  xfs: switch to ->write_iter()
  ...

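Among the new primitives listed above, copy_page_from_iter() is the
page-cache-side workhorse for ->write_iter() conversions. A minimal
sketch of its use (hypothetical helper; locking and
write_begin/write_end bookkeeping omitted):

	/* copy at most one page worth of data from the iterator;
	 * copy_page_from_iter() advances the iter by the bytes copied */
	static size_t copy_one_page(struct page *page, loff_t pos,
				    struct iov_iter *i)
	{
		size_t offset = offset_in_page(pos);
		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
				     iov_iter_count(i));

		return copy_page_from_iter(page, offset, bytes, i);
	}
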
47 files changed:
drivers/staging/lustre/lustre/lclient/lcommon_cl.c
drivers/staging/lustre/lustre/llite/file.c
drivers/staging/lustre/lustre/llite/llite_internal.h
drivers/staging/lustre/lustre/llite/rw.c
drivers/staging/lustre/lustre/llite/rw26.c
drivers/staging/lustre/lustre/llite/vvp_io.c
drivers/usb/gadget/storage_common.c
fs/9p/vfs_file.c
fs/affs/file.c
fs/block_dev.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/ceph/addr.c
fs/cifs/cifsfs.c
fs/cifs/cifsfs.h
fs/cifs/file.c
fs/dcache.c
fs/ext3/inode.c
fs/ext4/ext4.h
fs/ext4/file.c
fs/ext4/inode.c
fs/f2fs/data.c
fs/f2fs/file.c
fs/fat/inode.c
fs/file_table.c
fs/fuse/file.c
fs/gfs2/aops.c
fs/gfs2/file.c
fs/nfs/direct.c
fs/nfs/file.c
fs/nfs/internal.h
fs/nfs/nfs4file.c
fs/ntfs/file.c
fs/ocfs2/file.c
fs/reiserfs/file.c
fs/reiserfs/inode.c
fs/ubifs/file.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_file.c
fs/xfs/xfs_trace.h
include/linux/blk_types.h
include/linux/fs.h
include/linux/nfs_fs.h
mm/filemap.c
mm/page_io.c
mm/shmem.c
mm/vmscan.c

diff --cc drivers/staging/lustre/lustre/lclient/lcommon_cl.c
index dc24cfa5803722dd86669102189c27d793b83ecf,a07d5156bc50c017f8d942b090c7ce901735bb94..1b0c216bc5687742198c89c8d6f21b07036a11bb
@@@ -63,7 -63,7 +63,7 @@@
  
  #include "../llite/llite_internal.h"
  
 -const struct cl_req_operations ccc_req_ops;
 +static const struct cl_req_operations ccc_req_ops;
  
  /*
   * ccc_ prefix stands for "Common Client Code".
@@@ -112,11 -112,12 +112,11 @@@ static struct lu_kmem_descr ccc_caches[
   *
   */
  
 -void *ccc_key_init(const struct lu_context *ctx,
 -                        struct lu_context_key *key)
 +void *ccc_key_init(const struct lu_context *ctx, struct lu_context_key *key)
  {
        struct ccc_thread_info *info;
  
 -      OBD_SLAB_ALLOC_PTR_GFP(info, ccc_thread_kmem, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(info, ccc_thread_kmem, GFP_NOFS);
        if (info == NULL)
                info = ERR_PTR(-ENOMEM);
        return info;
@@@ -134,7 -135,7 +134,7 @@@ void *ccc_session_key_init(const struc
  {
        struct ccc_session *session;
  
 -      OBD_SLAB_ALLOC_PTR_GFP(session, ccc_session_kmem, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(session, ccc_session_kmem, GFP_NOFS);
        if (session == NULL)
                session = ERR_PTR(-ENOMEM);
        return session;
@@@ -250,7 -251,7 +250,7 @@@ int ccc_req_init(const struct lu_env *e
        struct ccc_req *vrq;
        int result;
  
 -      OBD_SLAB_ALLOC_PTR_GFP(vrq, ccc_req_kmem, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(vrq, ccc_req_kmem, GFP_NOFS);
        if (vrq != NULL) {
                cl_req_slice_add(req, &vrq->crq_cl, dev, &ccc_req_ops);
                result = 0;
@@@ -326,7 -327,7 +326,7 @@@ struct lu_object *ccc_object_alloc(cons
        struct ccc_object *vob;
        struct lu_object  *obj;
  
 -      OBD_SLAB_ALLOC_PTR_GFP(vob, ccc_object_kmem, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(vob, ccc_object_kmem, GFP_NOFS);
        if (vob != NULL) {
                struct cl_object_header *hdr;
  
@@@ -395,7 -396,7 +395,7 @@@ int ccc_lock_init(const struct lu_env *
  
        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
  
 -      OBD_SLAB_ALLOC_PTR_GFP(clk, ccc_lock_kmem, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(clk, ccc_lock_kmem, GFP_NOFS);
        if (clk != NULL) {
                cl_lock_slice_add(lock, &clk->clk_cl, obj, lkops);
                result = 0;
@@@ -720,31 -721,12 +720,12 @@@ int ccc_io_one_lock_index(const struct 
  void ccc_io_update_iov(const struct lu_env *env,
                       struct ccc_io *cio, struct cl_io *io)
  {
-       int i;
        size_t size = io->u.ci_rw.crw_count;
  
-       cio->cui_iov_olen = 0;
-       if (!cl_is_normalio(env, io) || cio->cui_tot_nrsegs == 0)
+       if (!cl_is_normalio(env, io) || cio->cui_iter == NULL)
                return;
  
-       for (i = 0; i < cio->cui_tot_nrsegs; i++) {
-               struct iovec *iv = &cio->cui_iov[i];
-               if (iv->iov_len < size)
-                       size -= iv->iov_len;
-               else {
-                       if (iv->iov_len > size) {
-                               cio->cui_iov_olen = iv->iov_len;
-                               iv->iov_len = size;
-                       }
-                       break;
-               }
-       }
-       cio->cui_nrsegs = i + 1;
-       LASSERTF(cio->cui_tot_nrsegs >= cio->cui_nrsegs,
-                "tot_nrsegs: %lu, nrsegs: %lu\n",
-                cio->cui_tot_nrsegs, cio->cui_nrsegs);
+       iov_iter_truncate(cio->cui_iter, size);
  }
  
  int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
@@@ -775,30 -757,7 +756,7 @@@ void ccc_io_advance(const struct lu_en
        if (!cl_is_normalio(env, io))
                return;
  
-       LASSERT(cio->cui_tot_nrsegs >= cio->cui_nrsegs);
-       LASSERT(cio->cui_tot_count  >= nob);
-       cio->cui_iov    += cio->cui_nrsegs;
-       cio->cui_tot_nrsegs -= cio->cui_nrsegs;
-       cio->cui_tot_count  -= nob;
-       /* update the iov */
-       if (cio->cui_iov_olen > 0) {
-               struct iovec *iv;
-               cio->cui_iov--;
-               cio->cui_tot_nrsegs++;
-               iv = &cio->cui_iov[0];
-               if (io->ci_continue) {
-                       iv->iov_base += iv->iov_len;
-                       LASSERT(cio->cui_iov_olen > iv->iov_len);
-                       iv->iov_len = cio->cui_iov_olen - iv->iov_len;
-               } else {
-                       /* restore the iov_len, in case of restart io. */
-                       iv->iov_len = cio->cui_iov_olen;
-               }
-               cio->cui_iov_olen = 0;
-       }
+       iov_iter_reexpand(cio->cui_iter, cio->cui_tot_count  -= nob);
  }
  
  /**
@@@ -962,7 -921,7 +920,7 @@@ void ccc_req_attr_set(const struct lu_e
               JOBSTATS_JOBID_SIZE);
  }
  
 -const struct cl_req_operations ccc_req_ops = {
 +static const struct cl_req_operations ccc_req_ops = {
        .cro_attr_set   = ccc_req_attr_set,
        .cro_completion = ccc_req_completion
  };
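
The ccc_io_update_iov() and ccc_io_advance() hunks above are the
canonical shape of this conversion: the saved segment length
(cui_iov_olen), the clipping loop and the restart-time restore all
collapse into iov_iter primitives, because the iterator itself now
remembers how much it may hand out. A minimal sketch of that
truncate/advance/reexpand lifecycle (process_chunk() is hypothetical
and is assumed to consume the iterator via iov_iter_advance()):

	/* sketch: run one bounded round of I/O over an iov_iter */
	static ssize_t bounded_round(struct iov_iter *iter, size_t quota)
	{
		size_t total = iov_iter_count(iter);
		ssize_t done;

		if (quota > total)
			quota = total;
		iov_iter_truncate(iter, quota);	/* hide everything past the quota */
		done = process_chunk(iter);	/* hypothetical; advances the iter */
		/* re-expose the hidden tail: unconsumed quota + hidden bytes */
		iov_iter_reexpand(iter, iov_iter_count(iter) + (total - quota));
		return done;
	}
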
diff --cc drivers/staging/lustre/lustre/llite/file.c
index c4ddec2b3589eb743475f022c29d47ce49dbbec8,3efda2540d295fb41595847419165811ee431652..716e1ee0104f6fe0c2c1323689f216acff807bdc
  
  #include "cl_object.h"
  
 -struct ll_file_data *ll_file_data_get(void)
 +static int
 +ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
 +
 +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 +                        bool *lease_broken);
 +
 +static enum llioc_iter
 +ll_iocontrol_call(struct inode *inode, struct file *file,
 +                unsigned int cmd, unsigned long arg, int *rcp);
 +
 +static struct ll_file_data *ll_file_data_get(void)
  {
        struct ll_file_data *fd;
  
 -      OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
 +      OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
        if (fd == NULL)
                return NULL;
        fd->fd_write_failed = false;
@@@ -257,8 -247,8 +257,8 @@@ int ll_md_real_close(struct inode *inod
        return rc;
  }
  
 -int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 -              struct file *file)
 +static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 +                     struct file *file)
  {
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_inode_info *lli = ll_i2info(inode);
@@@ -492,8 -482,8 +492,8 @@@ static int ll_och_fill(struct obd_expor
        return md_set_open_replay_data(md_exp, och, it);
  }
  
 -int ll_local_open(struct file *file, struct lookup_intent *it,
 -                struct ll_file_data *fd, struct obd_client_handle *och)
 +static int ll_local_open(struct file *file, struct lookup_intent *it,
 +                       struct ll_file_data *fd, struct obd_client_handle *och)
  {
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
@@@ -743,9 -733,8 +743,9 @@@ static int ll_md_blocking_lease_ast(str
  /**
   * Acquire a lease and open the file.
   */
 -struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
 -                                      fmode_t fmode, __u64 open_flags)
 +static struct obd_client_handle *
 +ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 +            __u64 open_flags)
  {
        struct lookup_intent it = { .it_op = IT_OPEN };
        struct ll_sb_info *sbi = ll_i2sbi(inode);
@@@ -873,13 -862,14 +873,13 @@@ out
        OBD_FREE_PTR(och);
        return ERR_PTR(rc);
  }
 -EXPORT_SYMBOL(ll_lease_open);
  
  /**
   * Release lease and close the file.
   * It will check if the lease has ever broken.
   */
 -int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 -                      bool *lease_broken)
 +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 +                        bool *lease_broken)
  {
        struct ldlm_lock *lock;
        bool cancelled = true;
                                       NULL);
        return rc;
  }
 -EXPORT_SYMBOL(ll_lease_close);
  
  /* Fills the obdo with the attributes for the lsm */
  static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
@@@ -1114,9 -1105,7 +1114,7 @@@ restart
  
                switch (vio->cui_io_subtype) {
                case IO_NORMAL:
-                       cio->cui_iov = args->u.normal.via_iov;
-                       cio->cui_nrsegs = args->u.normal.via_nrsegs;
-                       cio->cui_tot_nrsegs = cio->cui_nrsegs;
+                       cio->cui_iter = args->u.normal.via_iter;
                        cio->cui_iocb = args->u.normal.via_iocb;
                        if ((iot == CIT_WRITE) &&
                            !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
        return result;
  }
  
- static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos)
+ static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
        struct lu_env      *env;
        struct vvp_io_args *args;
-       size_t        count = 0;
        ssize_t      result;
        int              refcheck;
  
-       result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
-       if (result)
-               return result;
        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                return PTR_ERR(env);
  
        args = vvp_env_args(env, IO_NORMAL);
-       args->u.normal.via_iov = (struct iovec *)iov;
-       args->u.normal.via_nrsegs = nr_segs;
+       args->u.normal.via_iter = to;
        args->u.normal.via_iocb = iocb;
  
        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
-                                   &iocb->ki_pos, count);
-       cl_env_put(env, &refcheck);
-       return result;
- }
- static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
-                           loff_t *ppos)
- {
-       struct lu_env *env;
-       struct iovec  *local_iov;
-       struct kiocb  *kiocb;
-       ssize_t result;
-       int         refcheck;
-       env = cl_env_get(&refcheck);
-       if (IS_ERR(env))
-               return PTR_ERR(env);
-       local_iov = &vvp_env_info(env)->vti_local_iov;
-       kiocb = &vvp_env_info(env)->vti_kiocb;
-       local_iov->iov_base = (void __user *)buf;
-       local_iov->iov_len = count;
-       init_sync_kiocb(kiocb, file);
-       kiocb->ki_pos = *ppos;
-       kiocb->ki_nbytes = count;
-       result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
-       *ppos = kiocb->ki_pos;
+                                   &iocb->ki_pos, iov_iter_count(to));
        cl_env_put(env, &refcheck);
        return result;
  }
  /*
   * Write to a file (through the page cache).
   */
- static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                unsigned long nr_segs, loff_t pos)
+ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct lu_env      *env;
        struct vvp_io_args *args;
-       size_t        count = 0;
        ssize_t      result;
        int              refcheck;
  
-       result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
-       if (result)
-               return result;
        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                return PTR_ERR(env);
  
        args = vvp_env_args(env, IO_NORMAL);
-       args->u.normal.via_iov = (struct iovec *)iov;
-       args->u.normal.via_nrsegs = nr_segs;
+       args->u.normal.via_iter = from;
        args->u.normal.via_iocb = iocb;
  
        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
-                                 &iocb->ki_pos, count);
+                                 &iocb->ki_pos, iov_iter_count(from));
        cl_env_put(env, &refcheck);
        return result;
  }
  
- static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
-                            loff_t *ppos)
- {
-       struct lu_env *env;
-       struct iovec  *local_iov;
-       struct kiocb  *kiocb;
-       ssize_t result;
-       int         refcheck;
-       env = cl_env_get(&refcheck);
-       if (IS_ERR(env))
-               return PTR_ERR(env);
-       local_iov = &vvp_env_info(env)->vti_local_iov;
-       kiocb = &vvp_env_info(env)->vti_kiocb;
-       local_iov->iov_base = (void __user *)buf;
-       local_iov->iov_len = count;
-       init_sync_kiocb(kiocb, file);
-       kiocb->ki_pos = *ppos;
-       kiocb->ki_nbytes = count;
-       result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
-       *ppos = kiocb->ki_pos;
-       cl_env_put(env, &refcheck);
-       return result;
- }
  /*
   * Send file content (through pagecache) somewhere with helper
   */
@@@ -1449,7 -1366,7 +1375,7 @@@ int ll_lov_getstripe_ea_info(struct ino
        struct md_op_data *op_data;
        int rc, lmmsize;
  
 -      rc = ll_get_max_mdsize(sbi, &lmmsize);
 +      rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc)
                return rc;
  
@@@ -1599,8 -1516,7 +1525,8 @@@ static int ll_lov_getstripe(struct inod
        return rc;
  }
  
 -int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
 +static int
 +ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
  {
        struct ll_inode_info   *lli = ll_i2info(inode);
        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
   * Get size for inode for which FIEMAP mapping is requested.
   * Make the FIEMAP get_info call and returns the result.
   */
 -int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
 -            int num_bytes)
 +static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
 +                      size_t num_bytes)
  {
        struct obd_export *exp = ll_i2dtexp(inode);
        struct lov_stripe_md *lsm = NULL;
        struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
 -      int vallen = num_bytes;
 +      __u32 vallen = num_bytes;
        int rc;
  
        /* Checks for fiemap flags */
@@@ -1829,10 -1745,6 +1755,10 @@@ static int ll_ioctl_fiemap(struct inod
        if (get_user(extent_count,
            &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
                return -EFAULT;
 +
 +      if (extent_count >=
 +          (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
 +              return -EINVAL;
        num_bytes = sizeof(*fiemap_s) + (extent_count *
                                         sizeof(struct ll_fiemap_extent));
  
@@@ -2204,8 -2116,7 +2130,8 @@@ out
        return rc;
  }
  
 -long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 +static long
 +ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
  {
        struct inode            *inode = file->f_dentry->d_inode;
        struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
  }
  
  
 -loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
 +static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
  {
        struct inode *inode = file->f_dentry->d_inode;
        loff_t retval, eof = 0;
        return retval;
  }
  
 -int ll_flush(struct file *file, fl_owner_t id)
 +static int ll_flush(struct file *file, fl_owner_t id)
  {
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
  
  /**
   * Called to make sure a portion of file has been written out.
 - * if @local_only is not true, it will send OST_SYNC RPCs to ost.
 + * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
   *
   * Return how many pages have been written.
   */
@@@ -2668,10 -2579,11 +2594,10 @@@ int ll_fsync(struct file *file, loff_t 
        if (!err)
                ptlrpc_req_finished(req);
  
 -      if (datasync && S_ISREG(inode->i_mode)) {
 +      if (S_ISREG(inode->i_mode)) {
                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
  
 -              err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
 -                              CL_FSYNC_ALL, 0);
 +              err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
                if (rc == 0 && err < 0)
                        rc = err;
                if (rc < 0)
        return rc;
  }
  
 -int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
 +static int
 +ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
  {
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
  
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
  
 -      if (file_lock->fl_flags & FL_FLOCK) {
 +      if (file_lock->fl_flags & FL_FLOCK)
                LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
 -              /* flocks are whole-file locks */
 -              flock.l_flock.end = OFFSET_MAX;
 -              /* For flocks owner is determined by the local file descriptor*/
 -              flock.l_flock.owner = (unsigned long)file_lock->fl_file;
 -      } else if (file_lock->fl_flags & FL_POSIX) {
 -              flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
 -              flock.l_flock.start = file_lock->fl_start;
 -              flock.l_flock.end = file_lock->fl_end;
 -      } else {
 +      else if (!(file_lock->fl_flags & FL_POSIX))
                return -EINVAL;
 -      }
 +
 +      flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
        flock.l_flock.pid = file_lock->fl_pid;
 +      flock.l_flock.start = file_lock->fl_start;
 +      flock.l_flock.end = file_lock->fl_end;
  
        /* Somewhat ugly workaround for svc lockd.
         * lockd installs custom fl_lmops->lm_compare_owner that checks
        return rc;
  }
  
 -int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
 +static int
 +ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
  {
        return -ENOSYS;
  }
@@@ -2893,16 -2808,16 +2819,16 @@@ static int ll_inode_revalidate_fini(str
                if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
                        return 0;
        } else if (rc != 0) {
 -              CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
 -                     ll_get_fsname(inode->i_sb, NULL, 0),
 -                     PFID(ll_inode2fid(inode)), rc);
 +              CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
 +                           "%s: revalidate FID "DFID" error: rc = %d\n",
 +                           ll_get_fsname(inode->i_sb, NULL, 0),
 +                           PFID(ll_inode2fid(inode)), rc);
        }
  
        return rc;
  }
  
 -int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
 -                           __u64 ibits)
 +static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
  {
        struct inode *inode = dentry->d_inode;
        struct ptlrpc_request *req = NULL;
                int ealen = 0;
  
                if (S_ISREG(inode->i_mode)) {
 -                      rc = ll_get_max_mdsize(sbi, &ealen);
 +                      rc = ll_get_default_mdsize(sbi, &ealen);
                        if (rc)
                                return rc;
                        valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
        return rc;
  }
  
 -int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
 -                         __u64 ibits)
 +static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
  {
        struct inode *inode = dentry->d_inode;
        int rc;
  
 -      rc = __ll_inode_revalidate_it(dentry, it, ibits);
 +      rc = __ll_inode_revalidate(dentry, ibits);
        if (rc != 0)
                return rc;
  
        return rc;
  }
  
 -int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
 -                struct lookup_intent *it, struct kstat *stat)
 +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
  {
        struct inode *inode = de->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_inode_info *lli = ll_i2info(inode);
        int res = 0;
  
 -      res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
 -                                           MDS_INODELOCK_LOOKUP);
 +      res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
 +                                    MDS_INODELOCK_LOOKUP);
        ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
  
        if (res)
  
        return 0;
  }
 -int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
 -{
 -      struct lookup_intent it = { .it_op = IT_GETATTR };
 -
 -      return ll_getattr_it(mnt, de, &it, stat);
 -}
  
 -int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 -              __u64 start, __u64 len)
 +static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 +                   __u64 start, __u64 len)
  {
        int rc;
        size_t num_bytes;
        fiemap->fm_extent_count = fieinfo->fi_extents_max;
        fiemap->fm_start = start;
        fiemap->fm_length = len;
 -      memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
 -             sizeof(struct ll_fiemap_extent));
 +      if (extent_count > 0)
 +              memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
 +                     sizeof(struct ll_fiemap_extent));
  
        rc = ll_do_fiemap(inode, fiemap, num_bytes);
  
        fieinfo->fi_flags = fiemap->fm_flags;
        fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
 -      memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
 -             fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
 +      if (extent_count > 0)
 +              memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
 +                     fiemap->fm_mapped_extents *
 +                     sizeof(struct ll_fiemap_extent));
  
        OBD_FREE_LARGE(fiemap, num_bytes);
        return rc;
  }
  
 -struct posix_acl * ll_get_acl(struct inode *inode, int type)
 +struct posix_acl *ll_get_acl(struct inode *inode, int type)
  {
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *acl = NULL;
@@@ -3123,8 -3043,10 +3049,8 @@@ int ll_inode_permission(struct inode *i
        * need to do it before permission check. */
  
        if (inode == inode->i_sb->s_root->d_inode) {
 -              struct lookup_intent it = { .it_op = IT_LOOKUP };
 -
 -              rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
 -                                            MDS_INODELOCK_LOOKUP);
 +              rc = __ll_inode_revalidate(inode->i_sb->s_root,
 +                                         MDS_INODELOCK_LOOKUP);
                if (rc)
                        return rc;
        }
  
  /* -o localflock - only provides locally consistent flock locks */
  struct file_operations ll_file_operations = {
-       .read      = ll_file_read,
-       .aio_read = ll_file_aio_read,
-       .write    = ll_file_write,
-       .aio_write = ll_file_aio_write,
+       .read      = new_sync_read,
+       .read_iter = ll_file_read_iter,
+       .write    = new_sync_write,
+       .write_iter = ll_file_write_iter,
        .unlocked_ioctl = ll_file_ioctl,
        .open      = ll_file_open,
        .release        = ll_file_release,
  };
  
  struct file_operations ll_file_operations_flock = {
-       .read      = ll_file_read,
-       .aio_read    = ll_file_aio_read,
-       .write    = ll_file_write,
-       .aio_write   = ll_file_aio_write,
+       .read      = new_sync_read,
+       .read_iter    = ll_file_read_iter,
+       .write    = new_sync_write,
+       .write_iter   = ll_file_write_iter,
        .unlocked_ioctl = ll_file_ioctl,
        .open      = ll_file_open,
        .release        = ll_file_release,
  
  /* These are for -o noflock - to return ENOSYS on flock calls */
  struct file_operations ll_file_operations_noflock = {
-       .read      = ll_file_read,
-       .aio_read    = ll_file_aio_read,
-       .write    = ll_file_write,
-       .aio_write   = ll_file_aio_write,
+       .read      = new_sync_read,
+       .read_iter    = ll_file_read_iter,
+       .write    = new_sync_write,
+       .write_iter   = ll_file_write_iter,
        .unlocked_ioctl = ll_file_ioctl,
        .open      = ll_file_open,
        .release        = ll_file_release,
@@@ -3276,9 -3198,8 +3202,9 @@@ void ll_iocontrol_unregister(void *magi
  EXPORT_SYMBOL(ll_iocontrol_register);
  EXPORT_SYMBOL(ll_iocontrol_unregister);
  
 -enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
 -                      unsigned int cmd, unsigned long arg, int *rcp)
 +static enum llioc_iter
 +ll_iocontrol_call(struct inode *inode, struct file *file,
 +                unsigned int cmd, unsigned long arg, int *rcp)
  {
        enum llioc_iter ret = LLIOC_CONT;
        struct llioc_data *data;
@@@ -3363,7 -3284,7 +3289,7 @@@ static int ll_layout_fetch(struct inod
         * layout here. Please note that we can't use the LVB buffer in
         * completion AST because it doesn't have a large enough buffer */
        oc = ll_mdscapa_get(inode);
 -      rc = ll_get_max_mdsize(sbi, &lmmsize);
 +      rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc == 0)
                rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
                                OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
                return rc;
  
        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 -      if (body == NULL || body->eadatasize > lmmsize)
 +      if (body == NULL)
                GOTO(out, rc = -EPROTO);
  
        lmmsize = body->eadatasize;
@@@ -3440,7 -3361,7 +3366,7 @@@ static int ll_layout_lock_set(struct lu
                if (lvb_ready) {
                        /* layout_gen must be valid if layout lock is not
                         * cancelled and stripe has already set */
 -                      *gen = lli->lli_layout_gen;
 +                      *gen = ll_layout_version_get(lli);
                        rc = 0;
                }
                GOTO(out, rc);
@@@ -3538,20 -3459,32 +3464,20 @@@ int ll_layout_refresh(struct inode *ino
        };
        int rc;
  
 -      *gen = lli->lli_layout_gen;
 -      if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
 +      *gen = ll_layout_version_get(lli);
 +      if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
                return 0;
  
        /* sanity checks */
        LASSERT(fid_is_sane(ll_inode2fid(inode)));
        LASSERT(S_ISREG(inode->i_mode));
  
 -      /* mostly layout lock is caching on the local side, so try to match
 -       * it before grabbing layout lock mutex. */
 -      mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
 -                             LCK_CR | LCK_CW | LCK_PR | LCK_PW);
 -      if (mode != 0) { /* hit cached lock */
 -              rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
 -              if (rc == 0)
 -                      return 0;
 -
 -              /* better hold lli_layout_mutex to try again otherwise
 -               * it will have starvation problem. */
 -      }
 -
        /* take layout lock mutex to enqueue layout lock exclusively. */
        mutex_lock(&lli->lli_layout_mutex);
  
  again:
 -      /* try again. Maybe somebody else has done this. */
 +      /* mostly layout lock is caching on the local side, so try to match
 +       * it before grabbing layout lock mutex. */
        mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
                               LCK_CR | LCK_CW | LCK_PR | LCK_PW);
        if (mode != 0) { /* hit cached lock */
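
The ll_file_read()/ll_file_write() wrappers deleted above, which built a
one-segment iovec and a sync kiocb by hand, are exactly what
new_sync_read()/new_sync_write() now provide generically for every
converted filesystem. A simplified sketch of the read side, modeled on
this era's fs/read_write.c (error handling trimmed):

	static ssize_t new_sync_read(struct file *filp, char __user *buf,
				     size_t len, loff_t *ppos)
	{
		struct iovec iov = { .iov_base = buf, .iov_len = len };
		struct kiocb kiocb;
		struct iov_iter iter;
		ssize_t ret;

		init_sync_kiocb(&kiocb, filp);
		kiocb.ki_pos = *ppos;
		kiocb.ki_nbytes = len;
		iov_iter_init(&iter, READ, &iov, 1, len);

		ret = filp->f_op->read_iter(&kiocb, &iter);
		if (ret == -EIOCBQUEUED)
			ret = wait_on_sync_kiocb(&kiocb);
		*ppos = kiocb.ki_pos;
		return ret;
	}
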
diff --cc drivers/staging/lustre/lustre/llite/llite_internal.h
index dde7632ba01fa8dab671fb43995723bba1f5d3dd,fbb8650ead346885c4f0dd14b5e3bdc7ead493b7..140ee947ba4949ea547ac03ebaf9a9efb9e51ab1
  #define LUSTRE_FPRIVATE(file) ((file)->private_data)
  
  struct ll_dentry_data {
 -      int                             lld_cwd_count;
 -      int                             lld_mnt_count;
 -      struct obd_client_handle        lld_cwd_och;
 -      struct obd_client_handle        lld_mnt_och;
        struct lookup_intent            *lld_it;
        unsigned int                    lld_sa_generation;
        unsigned int                    lld_invalid:1;
@@@ -79,6 -83,8 +79,6 @@@
  
  #define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata))
  
 -extern struct file_operations ll_pgcache_seq_fops;
 -
  #define LLI_INODE_MAGIC                0x111d0de5
  #define LLI_INODE_DEAD                  0xdeadd00d
  
@@@ -114,12 -120,16 +114,12 @@@ enum lli_flags 
        /* Sizeon-on-MDS attributes are changed. An attribute update needs to
         * be sent to MDS. */
        LLIF_SOM_DIRTY    = (1 << 3),
 -      /* File is contented */
 -      LLIF_CONTENDED    = (1 << 4),
 -      /* Truncate uses server lock for this file */
 -      LLIF_SRVLOCK        = (1 << 5),
        /* File data is modified. */
 -      LLIF_DATA_MODIFIED      = (1 << 6),
 +      LLIF_DATA_MODIFIED      = (1 << 4),
        /* File is being restored */
 -      LLIF_FILE_RESTORING     = (1 << 7),
 +      LLIF_FILE_RESTORING     = (1 << 5),
        /* Xattr cache is attached to the file */
 -      LLIF_XATTR_CACHE        = (1 << 8),
 +      LLIF_XATTR_CACHE        = (1 << 6),
  };
  
  struct ll_inode_info {
                         * cleanup the dir readahead. */
                        void                       *d_opendir_key;
                        struct ll_statahead_info       *d_sai;
 -                      struct posix_acl               *d_def_acl;
                        /* protect statahead stuff. */
                        spinlock_t                      d_sa_lock;
                        /* "opendir_pid" is the token when lookup/revalid
  #define lli_readdir_mutex       u.d.d_readdir_mutex
  #define lli_opendir_key        u.d.d_opendir_key
  #define lli_sai                u.d.d_sai
 -#define lli_def_acl        u.d.d_def_acl
  #define lli_sa_lock        u.d.d_sa_lock
  #define lli_opendir_pid        u.d.d_opendir_pid
  
                /* for non-directory */
                struct {
 -                      struct semaphore                f_size_sem;
 -                      void                            *f_size_sem_owner;
 +                      struct mutex                    f_size_mutex;
                        char                            *f_symlink_name;
                        __u64                           f_maxbytes;
                        /*
                        /* for writepage() only to communicate to fsync */
                        int                             f_async_rc;
  
 -                      /* volatile file criteria is based on file name, this
 -                       * flag is used to keep the test result, so the strcmp
 -                       * is done only once
 -                       */
 -                      bool                            f_volatile;
                        /*
                         * whenever a process try to read/write the file, the
                         * jobid of the process will be saved here, and it'll
                        char                 f_jobid[JOBSTATS_JOBID_SIZE];
                } f;
  
 -#define lli_size_sem      u.f.f_size_sem
 -#define lli_size_sem_owner      u.f.f_size_sem_owner
 +#define lli_size_mutex          u.f.f_size_mutex
  #define lli_symlink_name      u.f.f_symlink_name
  #define lli_maxbytes      u.f.f_maxbytes
  #define lli_trunc_sem    u.f.f_trunc_sem
  #define lli_agl_index         u.f.f_agl_index
  #define lli_async_rc          u.f.f_async_rc
  #define lli_jobid             u.f.f_jobid
 -#define lli_volatile          u.f.f_volatile
  
        } u;
  
  
        /* mutex to request for layout lock exclusively. */
        struct mutex                    lli_layout_mutex;
 -      /* valid only inside LAYOUT ibits lock, protected by lli_layout_mutex */
 +      /* Layout version, protected by lli_layout_lock */
        __u32                           lli_layout_gen;
 +      spinlock_t                      lli_layout_lock;
  
        struct rw_semaphore             lli_xattrs_list_rwsem;
        struct mutex                    lli_xattrs_enq_lock;
        struct list_head                lli_xattrs;/* ll_xattr_entry->xe_list */
  };
  
 +static inline __u32 ll_layout_version_get(struct ll_inode_info *lli)
 +{
 +      __u32 gen;
 +
 +      spin_lock(&lli->lli_layout_lock);
 +      gen = lli->lli_layout_gen;
 +      spin_unlock(&lli->lli_layout_lock);
 +
 +      return gen;
 +}
 +
 +static inline void ll_layout_version_set(struct ll_inode_info *lli, __u32 gen)
 +{
 +      spin_lock(&lli->lli_layout_lock);
 +      lli->lli_layout_gen = gen;
 +      spin_unlock(&lli->lli_layout_lock);
 +}
 +
  int ll_xattr_cache_destroy(struct inode *inode);
  
  int ll_xattr_cache_get(struct inode *inode,
   * Locking to guarantee consistency of non-atomic updates to long long i_size,
   * consistency between file size and KMS.
   *
 - * Implemented by ->lli_size_sem and ->lsm_lock, nested in that order.
 + * Implemented by ->lli_size_mutex and ->lsm_lock, nested in that order.
   */
  
  void ll_inode_size_lock(struct inode *inode);
@@@ -441,6 -442,10 +441,6 @@@ enum stats_track_type 
        "xattr",        \
  }
  
 -/* default value for ll_sb_info->contention_time */
 -#define SBI_DEFAULT_CONTENTION_SECONDS     60
 -/* default value for lockless_truncate_enable */
 -#define SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE 1
  #define RCE_HASHES      32
  
  struct rmtacl_ctl_entry {
@@@ -651,6 -656,12 +651,6 @@@ static inline struct inode *ll_info2i(s
        return &lli->lli_vfs_inode;
  }
  
 -struct it_cb_data {
 -      struct inode  *icbd_parent;
 -      struct dentry **icbd_childp;
 -      obd_id  hash;
 -};
 -
  __u32 ll_i2suppgid(struct inode *i);
  void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2);
  
@@@ -658,13 -669,21 +658,13 @@@ static inline int ll_need_32bit_api(str
  {
  #if BITS_PER_LONG == 32
        return 1;
 +#elif defined(CONFIG_COMPAT)
 +      return unlikely(is_compat_task() || (sbi->ll_flags & LL_SBI_32BIT_API));
  #else
 -      return unlikely(
 -#ifdef CONFIG_COMPAT
 -              is_compat_task() ||
 -#endif
 -              (sbi->ll_flags & LL_SBI_32BIT_API)
 -      );
 +      return unlikely(sbi->ll_flags & LL_SBI_32BIT_API);
  #endif
  }
  
 -#define LLAP_MAGIC 98764321
 -
 -extern struct kmem_cache *ll_async_page_slab;
 -extern size_t ll_async_page_slab_size;
 -
  void ll_ra_read_in(struct file *f, struct ll_ra_read *rar);
  void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
  struct ll_ra_read *ll_ra_read_get(struct file *f);
@@@ -697,16 -716,14 +697,16 @@@ static inline void ll_rw_stats_tally(st
  
  /* llite/dir.c */
  void ll_release_page(struct page *page, int remove);
 -extern struct file_operations ll_dir_operations;
 -extern struct inode_operations ll_dir_inode_operations;
 +extern const struct file_operations ll_dir_operations;
 +extern const struct inode_operations ll_dir_inode_operations;
  struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
                             struct ll_dir_chain *chain);
  int ll_dir_read(struct inode *inode, struct dir_context *ctx);
  
  int ll_get_mdt_idx(struct inode *inode);
  /* llite/namei.c */
 +extern const struct inode_operations ll_special_inode_operations;
 +
  int ll_objects_destroy(struct ptlrpc_request *request,
                       struct inode *dir);
  struct inode *ll_iget(struct super_block *sb, ino_t hash,
@@@ -721,34 -738,43 +721,34 @@@ int ll_prepare_write(struct file *, str
  int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to);
  int ll_writepage(struct page *page, struct writeback_control *wbc);
  int ll_writepages(struct address_space *, struct writeback_control *wbc);
 -void ll_removepage(struct page *page);
  int ll_readpage(struct file *file, struct page *page);
  void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
 -int ll_file_punch(struct inode *, loff_t, int);
 -ssize_t ll_file_lockless_io(struct file *, char *, size_t, loff_t *, int);
 -void ll_clear_file_contended(struct inode*);
 -int ll_sync_page_range(struct inode *, struct address_space *, loff_t, size_t);
  int ll_readahead(const struct lu_env *env, struct cl_io *io,
                 struct ll_readahead_state *ras, struct address_space *mapping,
                 struct cl_page_list *queue, int flags);
  
 +#ifndef MS_HAS_NEW_AOPS
 +extern const struct address_space_operations ll_aops;
 +#else
 +extern const struct address_space_operations_ext ll_aops;
 +#endif
 +
  /* llite/file.c */
  extern struct file_operations ll_file_operations;
  extern struct file_operations ll_file_operations_flock;
  extern struct file_operations ll_file_operations_noflock;
  extern struct inode_operations ll_file_inode_operations;
 -extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
 -                                __u64);
  extern int ll_have_md_lock(struct inode *inode, __u64 *bits,
                           ldlm_mode_t l_req_mode);
  extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
                                   struct lustre_handle *lockh, __u64 flags,
                                   ldlm_mode_t mode);
 -int __ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
 -                           __u64 bits);
 -int ll_revalidate_nd(struct dentry *dentry, unsigned int flags);
  int ll_file_open(struct inode *inode, struct file *file);
  int ll_file_release(struct inode *inode, struct file *file);
  int ll_glimpse_ioctl(struct ll_sb_info *sbi,
                     struct lov_stripe_md *lsm, lstat_t *st);
  void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch);
 -int ll_local_open(struct file *file,
 -                struct lookup_intent *it, struct ll_file_data *fd,
 -                struct obd_client_handle *och);
  int ll_release_openhandle(struct dentry *, struct lookup_intent *);
 -int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 -              struct file *file);
  int ll_md_real_close(struct inode *inode, fmode_t fmode);
  void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
                      struct obd_client_handle **och, unsigned long flags);
@@@ -756,10 -782,15 +756,10 @@@ void ll_done_writing_attr(struct inode 
  int ll_som_update(struct inode *inode, struct md_op_data *op_data);
  int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
                     __u64 ioepoch, int sync);
 -int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
 -                struct md_open_data **mod);
  void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
                          struct lustre_handle *fh);
 -int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
 -             struct lookup_intent *it, struct kstat *stat);
  int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
 -struct ll_file_data *ll_file_data_get(void);
 -struct posix_acl * ll_get_acl(struct inode *inode, int type);
 +struct posix_acl *ll_get_acl(struct inode *inode, int type);
  
  int ll_inode_permission(struct inode *inode, int mask);
  
@@@ -774,30 -805,44 +774,30 @@@ int ll_dir_setstripe(struct inode *inod
  int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
                     int *lmm_size, struct ptlrpc_request **request);
  int ll_fsync(struct file *file, loff_t start, loff_t end, int data);
 -int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
 -            int num_bytes);
  int ll_merge_lvb(const struct lu_env *env, struct inode *inode);
 -int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg);
 -int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  int ll_fid2path(struct inode *inode, void *arg);
  int ll_data_version(struct inode *inode, __u64 *data_version, int extent_lock);
  int ll_hsm_release(struct inode *inode);
  
 -struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
 -                                      fmode_t mode, __u64 flags);
 -int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 -                 bool *lease_broken);
 -
  /* llite/dcache.c */
  
  int ll_d_init(struct dentry *de);
 -extern struct dentry_operations ll_d_ops;
 +extern const struct dentry_operations ll_d_ops;
  void ll_intent_drop_lock(struct lookup_intent *);
  void ll_intent_release(struct lookup_intent *);
  void ll_invalidate_aliases(struct inode *);
 -void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft);
  void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry);
 -int ll_dcompare(const struct dentry *parent, const struct dentry *dentry,
 -              unsigned int len, const char *str, const struct qstr *d_name);
  int ll_revalidate_it_finish(struct ptlrpc_request *request,
                            struct lookup_intent *it, struct dentry *de);
  
  /* llite/llite_lib.c */
  extern struct super_operations lustre_super_operations;
  
 -char *ll_read_opt(const char *opt, char *data);
  void ll_lli_init(struct ll_inode_info *lli);
  int ll_fill_super(struct super_block *sb, struct vfsmount *mnt);
  void ll_put_super(struct super_block *sb);
  void ll_kill_super(struct super_block *sb);
  struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock);
 -struct inode *ll_inode_from_lock(struct ldlm_lock *lock);
  void ll_clear_inode(struct inode *inode);
  int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import);
  int ll_setattr(struct dentry *de, struct iattr *attr);
@@@ -817,11 -862,9 +817,11 @@@ void ll_dirty_page_discard_warn(struct 
  int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
                  struct super_block *, struct lookup_intent *);
  void lustre_dump_dentry(struct dentry *, int recur);
 -void lustre_dump_inode(struct inode *);
  int ll_obd_statfs(struct inode *inode, void *arg);
  int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
 +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize);
 +int ll_get_max_cookiesize(struct ll_sb_info *sbi, int *max_cookiesize);
 +int ll_get_default_cookiesize(struct ll_sb_info *sbi, int *default_cookiesize);
  int ll_process_config(struct lustre_cfg *lcfg);
  struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
                                      struct inode *i1, struct inode *i2,
@@@ -838,6 -881,15 +838,6 @@@ void get_uuid2fsid(const char *name, in
  struct inode *search_inode_for_lustre(struct super_block *sb,
                                      const struct lu_fid *fid);
  
 -/* llite/special.c */
 -extern struct inode_operations ll_special_inode_operations;
 -extern struct file_operations ll_special_chr_inode_fops;
 -extern struct file_operations ll_special_chr_file_fops;
 -extern struct file_operations ll_special_blk_inode_fops;
 -extern struct file_operations ll_special_fifo_inode_fops;
 -extern struct file_operations ll_special_fifo_file_fops;
 -extern struct file_operations ll_special_sock_inode_fops;
 -
  /* llite/symlink.c */
  extern struct inode_operations ll_fast_symlink_inode_operations;
  
@@@ -905,6 -957,11 +905,6 @@@ struct vvp_io 
         * Set when cui_bead has been initialized.
         */
        int               cui_ra_window_set;
 -      /**
 -       * Partially truncated page, that vvp_io_trunc_start() keeps locked
 -       * across truncate.
 -       */
 -      struct cl_page      *cui_partpage;
  };
  
  /**
@@@ -917,8 -974,7 +917,7 @@@ struct vvp_io_args 
        union {
                struct {
                        struct kiocb      *via_iocb;
-                       struct iovec      *via_iov;
-                       unsigned long      via_nrsegs;
+                       struct iov_iter   *via_iter;
                } normal;
                struct {
                        struct pipe_inode_info  *via_pipe;
@@@ -933,9 -989,12 +932,9 @@@ struct ll_cl_context 
        struct cl_page *lcc_page;
        struct lu_env  *lcc_env;
        int          lcc_refcheck;
 -      int          lcc_created;
  };
  
  struct vvp_thread_info {
 -      struct ost_lvb       vti_lvb;
 -      struct cl_2queue     vti_queue;
        struct iovec     vti_local_iov;
        struct vvp_io_args   vti_args;
        struct ra_io_arg     vti_ria;
@@@ -982,17 -1041,25 +981,17 @@@ static inline struct vvp_io *vvp_env_io
        return &vvp_env_session(env)->vs_ios;
  }
  
 +int vvp_global_init(void);
 +void vvp_global_fini(void);
 +
  void ll_queue_done_writing(struct inode *inode, unsigned long flags);
  void ll_close_thread_shutdown(struct ll_close_queue *lcq);
  int ll_close_thread_start(struct ll_close_queue **lcq_ret);
  
  /* llite/llite_mmap.c */
 -typedef struct rb_root  rb_root_t;
 -typedef struct rb_node  rb_node_t;
 -
 -struct ll_lock_tree_node;
 -struct ll_lock_tree {
 -      rb_root_t                      lt_root;
 -      struct list_head                      lt_locked_list;
 -      struct ll_file_data         *lt_fd;
 -};
  
  int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last);
  int ll_file_mmap(struct file * file, struct vm_area_struct * vma);
 -struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
 -                                            __u64 end, ldlm_mode_t mode);
  void policy_from_vma(ldlm_policy_data_t *policy,
                struct vm_area_struct *vma, unsigned long addr, size_t count);
  struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
@@@ -1059,6 -1126,11 +1058,6 @@@ static inline struct lu_fid *ll_inode2f
        return fid;
  }
  
 -static inline int ll_mds_max_easize(struct super_block *sb)
 -{
 -      return sbi2mdc(ll_s2sbi(sb))->cl_max_mds_easize;
 -}
 -
  static inline __u64 ll_file_maxbytes(struct inode *inode)
  {
        return ll_i2info(inode)->lli_maxbytes;
@@@ -1076,6 -1148,7 +1075,6 @@@ int ll_removexattr(struct dentry *dentr
  extern struct kmem_cache *ll_remote_perm_cachep;
  extern struct kmem_cache *ll_rmtperm_hash_cachep;
  
 -struct hlist_head *alloc_rmtperm_hash(void);
  void free_rmtperm_hash(struct hlist_head *hash);
  int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm);
  int lustre_check_remote_perm(struct inode *inode, int mask);
@@@ -1088,6 -1161,7 +1087,6 @@@ void ll_capa_thread_stop(void)
  void ll_capa_timer_callback(unsigned long unused);
  
  struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa);
 -int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa);
  
  void ll_capa_open(struct inode *inode);
  void ll_capa_close(struct inode *inode);
@@@ -1107,12 -1181,14 +1106,12 @@@ extern struct lu_device_type vvp_device
   */
  int cl_sb_init(struct super_block *sb);
  int cl_sb_fini(struct super_block *sb);
 -enum cl_lock_mode  vvp_mode_from_vma(struct vm_area_struct *vma);
  void ll_io_init(struct cl_io *io, const struct file *file, int write);
  
  void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                struct ll_readahead_state *ras, unsigned long index,
                unsigned hit);
  void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len);
 -int ll_is_file_contended(struct file *file);
  void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which);
  
  /* llite/llite_rmtacl.c */
@@@ -1185,6 -1261,7 +1184,6 @@@ struct ll_statahead_info 
        unsigned int        sai_skip_hidden;/* skipped hidden dentry count */
        unsigned int        sai_ls_all:1,   /* "ls -al", do stat-ahead for
                                                 * hidden entries */
 -                              sai_in_readpage:1,/* statahead is in readdir()*/
                                sai_agl_valid:1;/* AGL is valid for the dir */
        wait_queue_head_t            sai_waitq;      /* stat-ahead wait queue */
        struct ptlrpc_thread    sai_thread;     /* stat-ahead thread */
@@@ -1309,6 -1386,9 +1308,6 @@@ typedef enum llioc_iter (*llioc_callbac
                struct file *file, unsigned int cmd, unsigned long arg,
                void *magic, int *rcp);
  
 -enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
 -              unsigned int cmd, unsigned long arg, int *rcp);
 -
  /* export functions */
  /* Register ioctl block dynamatically for a regular file.
   *
@@@ -1350,7 -1430,7 +1349,7 @@@ static inline void cl_isize_unlock(stru
  
  static inline void cl_isize_write_nolock(struct inode *inode, loff_t kms)
  {
 -      LASSERT(down_trylock(&ll_i2info(inode)->lli_size_sem) != 0);
 +      LASSERT(mutex_is_locked(&ll_i2info(inode)->lli_size_mutex));
        i_size_write(inode, kms);
  }
  
diff --cc drivers/staging/lustre/lustre/llite/rw.c
index f0122c568a099fbb2ea519ddc4bca5d97a34495a,b345dfa599f3d864145f34e2d57426cf4fd75bdd..56162103cc79c2dad9038abb11d715c6e9909f45
@@@ -77,6 -77,12 +77,6 @@@ static void ll_cl_fini(struct ll_cl_con
                cl_page_put(env, page);
        }
  
 -      if (io && lcc->lcc_created) {
 -              cl_io_end(env, io);
 -              cl_io_unlock(env, io);
 -              cl_io_iter_fini(env, io);
 -              cl_io_fini(env, io);
 -      }
        cl_env_put(env, &lcc->lcc_refcheck);
  }
  
@@@ -151,8 -157,7 +151,7 @@@ static struct ll_cl_context *ll_cl_init
                result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE);
                if (result == 0) {
                        cio->cui_fd = LUSTRE_FPRIVATE(file);
-                       cio->cui_iov = NULL;
-                       cio->cui_nrsegs = 0;
+                       cio->cui_iter = NULL;
                        result = cl_io_iter_init(env, io);
                        if (result == 0) {
                                result = cl_io_lock(env, io);
                        }
                } else
                        result = io->ci_result;
 -              lcc->lcc_created = 1;
        }
  
        lcc->lcc_io = io;
diff --cc drivers/staging/lustre/lustre/llite/rw26.c
index 55ca8d3c3e46451b654acdc298333035fcb11e4b,6b5994577b6b9f61bbb704341c72f0fa088183b7..af84c1aaa5f83f6c994a64da4b164535cbe54ac0
@@@ -218,14 -218,11 +218,11 @@@ static void ll_free_user_pages(struct p
        int i;
  
        for (i = 0; i < npages; i++) {
-               if (pages[i] == NULL)
-                       break;
                if (do_dirty)
                        set_page_dirty_lock(pages[i]);
                page_cache_release(pages[i]);
        }
-       OBD_FREE_LARGE(pages, npages * sizeof(*pages));
+       kvfree(pages);
  }
  
  ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
@@@ -363,18 -360,16 +360,16 @@@ static ssize_t ll_direct_IO_26_seg(cons
  #define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) & \
                      ~(DT_MAX_BRW_SIZE - 1))
  static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
-                              const struct iovec *iov, loff_t file_offset,
-                              unsigned long nr_segs)
+                              struct iov_iter *iter, loff_t file_offset)
  {
        struct lu_env *env;
        struct cl_io *io;
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        struct ccc_object *obj = cl_inode2ccc(inode);
-       long count = iov_length(iov, nr_segs);
-       long tot_bytes = 0, result = 0;
+       ssize_t count = iov_iter_count(iter);
+       ssize_t tot_bytes = 0, result = 0;
        struct ll_inode_info *lli = ll_i2info(inode);
-       unsigned long seg = 0;
        long size = MAX_DIO_SIZE;
        int refcheck;
  
        if ((file_offset & ~CFS_PAGE_MASK) || (count & ~CFS_PAGE_MASK))
                return -EINVAL;
  
 -      CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), size=%lu (max %lu), "
 -             "offset=%lld=%llx, pages %lu (max %lu)\n",
 +      CDEBUG(D_VFSTRACE,
 +             "VFS Op:inode=%lu/%u(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)\n",
               inode->i_ino, inode->i_generation, inode, count, MAX_DIO_SIZE,
               file_offset, file_offset, count >> PAGE_CACHE_SHIFT,
               MAX_DIO_SIZE >> PAGE_CACHE_SHIFT);
  
        /* Check that all user buffers are aligned as well */
-       for (seg = 0; seg < nr_segs; seg++) {
-               if (((unsigned long)iov[seg].iov_base & ~CFS_PAGE_MASK) ||
-                   (iov[seg].iov_len & ~CFS_PAGE_MASK))
-                       return -EINVAL;
-       }
+       if (iov_iter_alignment(iter) & ~CFS_PAGE_MASK)
+               return -EINVAL;
  
        env = cl_env_get(&refcheck);
        LASSERT(!IS_ERR(env));
                mutex_lock(&inode->i_mutex);
  
        LASSERT(obj->cob_transient_pages == 0);
-       for (seg = 0; seg < nr_segs; seg++) {
-               long iov_left = iov[seg].iov_len;
-               unsigned long user_addr = (unsigned long)iov[seg].iov_base;
+       while (iov_iter_count(iter)) {
+               struct page **pages;
+               size_t offs;
  
+               count = min_t(size_t, iov_iter_count(iter), size);
                if (rw == READ) {
                        if (file_offset >= i_size_read(inode))
                                break;
-                       if (file_offset + iov_left > i_size_read(inode))
-                               iov_left = i_size_read(inode) - file_offset;
+                       if (file_offset + count > i_size_read(inode))
+                               count = i_size_read(inode) - file_offset;
                }
  
-               while (iov_left > 0) {
-                       struct page **pages;
-                       int page_count, max_pages = 0;
-                       long bytes;
-                       bytes = min(size, iov_left);
-                       page_count = ll_get_user_pages(rw, user_addr, bytes,
-                                                      &pages, &max_pages);
-                       if (likely(page_count > 0)) {
-                               if (unlikely(page_count <  max_pages))
-                                       bytes = page_count << PAGE_CACHE_SHIFT;
-                               result = ll_direct_IO_26_seg(env, io, rw, inode,
-                                                            file->f_mapping,
-                                                            bytes, file_offset,
-                                                            pages, page_count);
-                               ll_free_user_pages(pages, max_pages, rw==READ);
-                       } else if (page_count == 0) {
-                               GOTO(out, result = -EFAULT);
-                       } else {
-                               result = page_count;
-                       }
-                       if (unlikely(result <= 0)) {
-                               /* If we can't allocate a large enough buffer
-                                * for the request, shrink it to a smaller
-                                * PAGE_SIZE multiple and try again.
-                                * We should always be able to kmalloc for a
-                                * page worth of page pointers = 4MB on i386. */
-                               if (result == -ENOMEM &&
-                                   size > (PAGE_CACHE_SIZE / sizeof(*pages)) *
-                                          PAGE_CACHE_SIZE) {
-                                       size = ((((size / 2) - 1) |
-                                                ~CFS_PAGE_MASK) + 1) &
-                                               CFS_PAGE_MASK;
-                                       CDEBUG(D_VFSTRACE,"DIO size now %lu\n",
-                                              size);
-                                       continue;
-                               }
-                               GOTO(out, result);
+               result = iov_iter_get_pages_alloc(iter, &pages, count, &offs);
+               if (likely(result > 0)) {
+                       int n = (result + offs + PAGE_SIZE - 1) / PAGE_SIZE;
+                       result = ll_direct_IO_26_seg(env, io, rw, inode,
+                                                    file->f_mapping,
+                                                    result, file_offset,
+                                                    pages, n);
+                       ll_free_user_pages(pages, n, rw == READ);
+               }
+               if (unlikely(result <= 0)) {
+                       /* If we can't allocate a large enough buffer
+                        * for the request, shrink it to a smaller
+                        * PAGE_SIZE multiple and try again.
+                        * We should always be able to kmalloc for a
+                        * page worth of page pointers = 4MB on i386. */
+                       if (result == -ENOMEM &&
+                           size > (PAGE_CACHE_SIZE / sizeof(*pages)) *
+                                  PAGE_CACHE_SIZE) {
+                               size = ((((size / 2) - 1) |
+                                        ~CFS_PAGE_MASK) + 1) &
+                                       CFS_PAGE_MASK;
+                               CDEBUG(D_VFSTRACE, "DIO size now %lu\n",
+                                      size);
+                               continue;
                        }
  
-                       tot_bytes += result;
-                       file_offset += result;
-                       iov_left -= result;
-                       user_addr += result;
+                       GOTO(out, result);
                }
+               iov_iter_advance(iter, result);
+               tot_bytes += result;
+               file_offset += result;
        }
  out:
        LASSERT(obj->cob_transient_pages == 0);
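
The rewritten loop above is lustre's adoption of the generic iov_iter pinning pattern added by this series. A minimal sketch of that pattern (kernel context; do_segment() is a hypothetical stand-in for ll_direct_IO_26_seg(), and error handling is simplified):

#include <linux/uio.h>
#include <linux/pagemap.h>

static ssize_t dio_walk(struct iov_iter *iter, loff_t pos,
                        ssize_t (*do_segment)(struct page **pages, int npages,
                                              size_t bytes, loff_t pos))
{
        ssize_t tot = 0;

        while (iov_iter_count(iter)) {
                struct page **pages;
                size_t offs;
                ssize_t got;
                int npages;

                /* pin user pages; the helper allocates the page array */
                got = iov_iter_get_pages_alloc(iter, &pages,
                                               iov_iter_count(iter), &offs);
                if (got <= 0)
                        return tot ? tot : got;

                npages = DIV_ROUND_UP(got + offs, PAGE_SIZE);
                /* do_segment() is expected to release the pinned pages */
                got = do_segment(pages, npages, got, pos);
                if (got <= 0)
                        return tot ? tot : got;

                iov_iter_advance(iter, got);    /* consume transferred bytes */
                tot += got;
                pos += got;
        }
        return tot;
}
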
@@@ -529,9 -507,9 +507,9 @@@ static int ll_write_end(struct file *fi
  }
  
  #ifdef CONFIG_MIGRATION
 -int ll_migratepage(struct address_space *mapping,
 -              struct page *newpage, struct page *page
 -              , enum migrate_mode mode
 +static int ll_migratepage(struct address_space *mapping,
 +                       struct page *newpage, struct page *page,
 +                       enum migrate_mode mode
                )
  {
        /* Always fail page migration until we have a proper implementation */
  #endif
  
  #ifndef MS_HAS_NEW_AOPS
 -struct address_space_operations ll_aops = {
 -      .readpage       = ll_readpage,
 -//    .readpages      = ll_readpages,
 +const struct address_space_operations ll_aops = {
 +      .readpage       = ll_readpage,
        .direct_IO      = ll_direct_IO_26,
        .writepage      = ll_writepage,
        .writepages     = ll_writepages,
  #ifdef CONFIG_MIGRATION
        .migratepage    = ll_migratepage,
  #endif
 -      .bmap      = NULL
  };
  #else
 -struct address_space_operations_ext ll_aops = {
 +const struct address_space_operations_ext ll_aops = {
        .orig_aops.readpage       = ll_readpage,
  //    .orig_aops.readpages      = ll_readpages,
        .orig_aops.direct_IO      = ll_direct_IO_26,
  #ifdef CONFIG_MIGRATION
        .orig_aops.migratepage    = ll_migratepage,
  #endif
 -      .orig_aops.bmap    = NULL,
        .write_begin    = ll_write_begin,
        .write_end      = ll_write_end
  };
index 7dd2b4723c5fd6fdded98fadbcb63a68ce8c80fa,cfe8c625ae6403c72a6fa7513e7cce79fe8c8570..0e0b404cb5e6cc3b33dc8b736675485a617cdd55
@@@ -80,7 -80,7 +80,7 @@@ static bool can_populate_pages(const st
        case CIT_WRITE:
        /* no lock is needed here to check lli_layout_gen, as we hold the
         * extent lock, and the GROUP lock has to be held to swap the layout */
 -              if (lli->lli_layout_gen != cio->cui_layout_gen) {
 +              if (ll_layout_version_get(lli) != cio->cui_layout_gen) {
                        io->ci_need_restart = 1;
                        /* this will return application a short read/write */
                        io->ci_continue = 0;
@@@ -190,7 -190,7 +190,7 @@@ static void vvp_io_fault_fini(const str
        vvp_io_fini(env, ios);
  }
  
 -enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma)
 +static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma)
  {
        /*
         * we only want to hold PW locks if the mmap() can generate
@@@ -211,27 -211,26 +211,26 @@@ static int vvp_mmap_locks(const struct 
        struct cl_lock_descr   *descr = &cti->cti_descr;
        ldlm_policy_data_t      policy;
        unsigned long      addr;
-       unsigned long      seg;
        ssize_t          count;
        int                  result;
+       struct iov_iter i;
+       struct iovec iov;
  
        LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
  
        if (!cl_is_normalio(env, io))
                return 0;
  
-       if (vio->cui_iov == NULL) /* nfs or loop back device write */
+       if (vio->cui_iter == NULL) /* nfs or loop back device write */
                return 0;
  
        /* No MM (e.g. NFS)? No vmas too. */
        if (mm == NULL)
                return 0;
  
-       for (seg = 0; seg < vio->cui_nrsegs; seg++) {
-               const struct iovec *iv = &vio->cui_iov[seg];
-               addr = (unsigned long)iv->iov_base;
-               count = iv->iov_len;
+       iov_for_each(iov, i, *(vio->cui_iter)) {
+               addr = (unsigned long)iov.iov_base;
+               count = iov.iov_len;
                if (count == 0)
                        continue;
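
iov_for_each() above walks the iovec segments of an iov_iter by value on a private copy, which is what lets vvp_mmap_locks() drop the manual cui_nrsegs bookkeeping. A hedged toy illustration of the idiom:

#include <linux/uio.h>

/* Count the non-empty segments of an iovec-backed iter; illustrative
 * only -- the copy 'i' is advanced by the macro, 'from' is untouched. */
static unsigned long count_nonempty_segs(const struct iov_iter *from)
{
        struct iov_iter i;
        struct iovec iov;
        unsigned long n = 0;

        iov_for_each(iov, i, *from)
                if (iov.iov_len != 0)
                        n++;
        return n;
}
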
  
@@@ -527,9 -526,7 +526,7 @@@ static int vvp_io_read_start(const stru
        switch (vio->cui_io_subtype) {
        case IO_NORMAL:
                LASSERT(cio->cui_iocb->ki_pos == pos);
-               result = generic_file_aio_read(cio->cui_iocb,
-                                              cio->cui_iov, cio->cui_nrsegs,
-                                              cio->cui_iocb->ki_pos);
+               result = generic_file_read_iter(cio->cui_iocb, cio->cui_iter);
                break;
        case IO_SPLICE:
                result = generic_file_splice_read(file, &pos,
@@@ -595,12 -592,11 +592,11 @@@ static int vvp_io_write_start(const str
  
        CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt);
  
-       if (cio->cui_iov == NULL) /* from a temp io in ll_cl_init(). */
+       if (cio->cui_iter == NULL) /* from a temp io in ll_cl_init(). */
                result = 0;
        else
-               result = generic_file_aio_write(cio->cui_iocb,
-                                               cio->cui_iov, cio->cui_nrsegs,
-                                               cio->cui_iocb->ki_pos);
+               result = generic_file_write_iter(cio->cui_iocb, cio->cui_iter);
        if (result > 0) {
                if (result < cnt)
                        io->ci_continue = 0;
@@@ -1162,10 -1158,9 +1158,9 @@@ int vvp_io_init(const struct lu_env *en
                 *  results."  -- Single Unix Spec */
                if (count == 0)
                        result = 1;
-               else {
+               else
                        cio->cui_tot_count = count;
-                       cio->cui_tot_nrsegs = 0;
-               }
                /* for read/write, we store the jobid in the inode, and
                 * it'll be fetched by osc when building RPC.
                 *
index ff205a7bc55c9aefcd98994aeb3c25fe4c37eb28,a8898df131ed5b4abdf37adb8784645e88840381..648f9e489b39bb3a291f091771dcd099013176db
@@@ -43,7 -43,7 +43,7 @@@ struct usb_interface_descriptor fsg_int
        .bInterfaceProtocol =   USB_PR_BULK,    /* Adjusted during fsg_bind() */
        .iInterface =           FSG_STRING_INTERFACE,
  };
 -EXPORT_SYMBOL(fsg_intf_desc);
 +EXPORT_SYMBOL_GPL(fsg_intf_desc);
  
  /*
   * Three full-speed endpoint descriptors: bulk-in, bulk-out, and
@@@ -58,7 -58,7 +58,7 @@@ struct usb_endpoint_descriptor fsg_fs_b
        .bmAttributes =         USB_ENDPOINT_XFER_BULK,
        /* wMaxPacketSize set by autoconfiguration */
  };
 -EXPORT_SYMBOL(fsg_fs_bulk_in_desc);
 +EXPORT_SYMBOL_GPL(fsg_fs_bulk_in_desc);
  
  struct usb_endpoint_descriptor fsg_fs_bulk_out_desc = {
        .bLength =              USB_DT_ENDPOINT_SIZE,
@@@ -68,7 -68,7 +68,7 @@@
        .bmAttributes =         USB_ENDPOINT_XFER_BULK,
        /* wMaxPacketSize set by autoconfiguration */
  };
 -EXPORT_SYMBOL(fsg_fs_bulk_out_desc);
 +EXPORT_SYMBOL_GPL(fsg_fs_bulk_out_desc);
  
  struct usb_descriptor_header *fsg_fs_function[] = {
        (struct usb_descriptor_header *) &fsg_intf_desc,
@@@ -76,7 -76,7 +76,7 @@@
        (struct usb_descriptor_header *) &fsg_fs_bulk_out_desc,
        NULL,
  };
 -EXPORT_SYMBOL(fsg_fs_function);
 +EXPORT_SYMBOL_GPL(fsg_fs_function);
  
  
  /*
@@@ -95,7 -95,7 +95,7 @@@ struct usb_endpoint_descriptor fsg_hs_b
        .bmAttributes =         USB_ENDPOINT_XFER_BULK,
        .wMaxPacketSize =       cpu_to_le16(512),
  };
 -EXPORT_SYMBOL(fsg_hs_bulk_in_desc);
 +EXPORT_SYMBOL_GPL(fsg_hs_bulk_in_desc);
  
  struct usb_endpoint_descriptor fsg_hs_bulk_out_desc = {
        .bLength =              USB_DT_ENDPOINT_SIZE,
        .wMaxPacketSize =       cpu_to_le16(512),
        .bInterval =            1,      /* NAK every 1 uframe */
  };
 -EXPORT_SYMBOL(fsg_hs_bulk_out_desc);
 +EXPORT_SYMBOL_GPL(fsg_hs_bulk_out_desc);
  
  
  struct usb_descriptor_header *fsg_hs_function[] = {
        (struct usb_descriptor_header *) &fsg_hs_bulk_out_desc,
        NULL,
  };
 -EXPORT_SYMBOL(fsg_hs_function);
 +EXPORT_SYMBOL_GPL(fsg_hs_function);
  
  struct usb_endpoint_descriptor fsg_ss_bulk_in_desc = {
        .bLength =              USB_DT_ENDPOINT_SIZE,
        .bmAttributes =         USB_ENDPOINT_XFER_BULK,
        .wMaxPacketSize =       cpu_to_le16(1024),
  };
 -EXPORT_SYMBOL(fsg_ss_bulk_in_desc);
 +EXPORT_SYMBOL_GPL(fsg_ss_bulk_in_desc);
  
  struct usb_ss_ep_comp_descriptor fsg_ss_bulk_in_comp_desc = {
        .bLength =              sizeof(fsg_ss_bulk_in_comp_desc),
  
        /*.bMaxBurst =          DYNAMIC, */
  };
 -EXPORT_SYMBOL(fsg_ss_bulk_in_comp_desc);
 +EXPORT_SYMBOL_GPL(fsg_ss_bulk_in_comp_desc);
  
  struct usb_endpoint_descriptor fsg_ss_bulk_out_desc = {
        .bLength =              USB_DT_ENDPOINT_SIZE,
        .bmAttributes =         USB_ENDPOINT_XFER_BULK,
        .wMaxPacketSize =       cpu_to_le16(1024),
  };
 -EXPORT_SYMBOL(fsg_ss_bulk_out_desc);
 +EXPORT_SYMBOL_GPL(fsg_ss_bulk_out_desc);
  
  struct usb_ss_ep_comp_descriptor fsg_ss_bulk_out_comp_desc = {
        .bLength =              sizeof(fsg_ss_bulk_in_comp_desc),
  
        /*.bMaxBurst =          DYNAMIC, */
  };
 -EXPORT_SYMBOL(fsg_ss_bulk_out_comp_desc);
 +EXPORT_SYMBOL_GPL(fsg_ss_bulk_out_comp_desc);
  
  struct usb_descriptor_header *fsg_ss_function[] = {
        (struct usb_descriptor_header *) &fsg_intf_desc,
        (struct usb_descriptor_header *) &fsg_ss_bulk_out_comp_desc,
        NULL,
  };
 -EXPORT_SYMBOL(fsg_ss_function);
 +EXPORT_SYMBOL_GPL(fsg_ss_function);
  
  
   /*-------------------------------------------------------------------------*/
@@@ -179,7 -179,7 +179,7 @@@ void fsg_lun_close(struct fsg_lun *curl
                curlun->filp = NULL;
        }
  }
 -EXPORT_SYMBOL(fsg_lun_close);
 +EXPORT_SYMBOL_GPL(fsg_lun_close);
  
  int fsg_lun_open(struct fsg_lun *curlun, const char *filename)
  {
         * If we can't read the file, it's no good.
         * If we can't write the file, use it read-only.
         */
-       if (!(filp->f_op->read || filp->f_op->aio_read)) {
+       if (!(filp->f_mode & FMODE_CAN_READ)) {
                LINFO(curlun, "file not readable: %s\n", filename);
                goto out;
        }
-       if (!(filp->f_op->write || filp->f_op->aio_write))
+       if (!(filp->f_mode & FMODE_CAN_WRITE))
                ro = 1;
  
        size = i_size_read(inode->i_mapping->host);
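
The f_mode bits tested above are computed once at open time; paraphrased (not verbatim) from this series' do_dentry_open():

#include <linux/fs.h>

/* Paraphrase: set when the file is opened, so later capability checks
 * reduce to cheap f_mode tests like the ones in fsg_lun_open(). */
static void sketch_set_rw_caps(struct file *f)
{
        if ((f->f_mode & FMODE_READ) &&
            (f->f_op->read || f->f_op->aio_read || f->f_op->read_iter))
                f->f_mode |= FMODE_CAN_READ;
        if ((f->f_mode & FMODE_WRITE) &&
            (f->f_op->write || f->f_op->aio_write || f->f_op->write_iter))
                f->f_mode |= FMODE_CAN_WRITE;
}
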
@@@ -278,7 -278,7 +278,7 @@@ out
        fput(filp);
        return rc;
  }
 -EXPORT_SYMBOL(fsg_lun_open);
 +EXPORT_SYMBOL_GPL(fsg_lun_open);
  
  
  /*-------------------------------------------------------------------------*/
@@@ -295,7 -295,7 +295,7 @@@ int fsg_lun_fsync_sub(struct fsg_lun *c
                return 0;
        return vfs_fsync(filp, 1);
  }
 -EXPORT_SYMBOL(fsg_lun_fsync_sub);
 +EXPORT_SYMBOL_GPL(fsg_lun_fsync_sub);
  
  void store_cdrom_address(u8 *dest, int msf, u32 addr)
  {
                put_unaligned_be32(addr, dest);
        }
  }
 -EXPORT_SYMBOL(store_cdrom_address);
 +EXPORT_SYMBOL_GPL(store_cdrom_address);
  
  /*-------------------------------------------------------------------------*/
  
@@@ -325,13 -325,13 +325,13 @@@ ssize_t fsg_show_ro(struct fsg_lun *cur
                                  ? curlun->ro
                                  : curlun->initially_ro);
  }
 -EXPORT_SYMBOL(fsg_show_ro);
 +EXPORT_SYMBOL_GPL(fsg_show_ro);
  
  ssize_t fsg_show_nofua(struct fsg_lun *curlun, char *buf)
  {
        return sprintf(buf, "%u\n", curlun->nofua);
  }
 -EXPORT_SYMBOL(fsg_show_nofua);
 +EXPORT_SYMBOL_GPL(fsg_show_nofua);
  
  ssize_t fsg_show_file(struct fsg_lun *curlun, struct rw_semaphore *filesem,
                      char *buf)
        up_read(filesem);
        return rc;
  }
 -EXPORT_SYMBOL(fsg_show_file);
 +EXPORT_SYMBOL_GPL(fsg_show_file);
  
  ssize_t fsg_show_cdrom(struct fsg_lun *curlun, char *buf)
  {
        return sprintf(buf, "%u\n", curlun->cdrom);
  }
 -EXPORT_SYMBOL(fsg_show_cdrom);
 +EXPORT_SYMBOL_GPL(fsg_show_cdrom);
  
  ssize_t fsg_show_removable(struct fsg_lun *curlun, char *buf)
  {
        return sprintf(buf, "%u\n", curlun->removable);
  }
 -EXPORT_SYMBOL(fsg_show_removable);
 +EXPORT_SYMBOL_GPL(fsg_show_removable);
  
  /*
   * The caller must hold fsg->filesem for reading when calling this function.
@@@ -410,7 -410,7 +410,7 @@@ ssize_t fsg_store_ro(struct fsg_lun *cu
  
        return rc;
  }
 -EXPORT_SYMBOL(fsg_store_ro);
 +EXPORT_SYMBOL_GPL(fsg_store_ro);
  
  ssize_t fsg_store_nofua(struct fsg_lun *curlun, const char *buf, size_t count)
  {
  
        return count;
  }
 -EXPORT_SYMBOL(fsg_store_nofua);
 +EXPORT_SYMBOL_GPL(fsg_store_nofua);
  
  ssize_t fsg_store_file(struct fsg_lun *curlun, struct rw_semaphore *filesem,
                       const char *buf, size_t count)
        up_write(filesem);
        return (rc < 0 ? rc : count);
  }
 -EXPORT_SYMBOL(fsg_store_file);
 +EXPORT_SYMBOL_GPL(fsg_store_file);
  
  ssize_t fsg_store_cdrom(struct fsg_lun *curlun, struct rw_semaphore *filesem,
                        const char *buf, size_t count)
  
        return ret;
  }
 -EXPORT_SYMBOL(fsg_store_cdrom);
 +EXPORT_SYMBOL_GPL(fsg_store_cdrom);
  
  ssize_t fsg_store_removable(struct fsg_lun *curlun, const char *buf,
                            size_t count)
  
        return count;
  }
 -EXPORT_SYMBOL(fsg_store_removable);
 +EXPORT_SYMBOL_GPL(fsg_store_removable);
  
  MODULE_LICENSE("GPL");
diff --combined fs/9p/vfs_file.c
index 96e550760699a8895cbb58a4dd26a18ab1e050a3,b9b5f979a2ca7baafa426037229f6d208dd3710d..520c11c2dcca4c9ff31a591600ca0c8ced52481c
@@@ -352,6 -352,9 +352,6 @@@ static int v9fs_file_flock_dotl(struct 
                invalidate_mapping_pages(&inode->i_data, 0, -1);
        }
        /* Convert flock to posix lock */
 -      fl->fl_owner = (fl_owner_t)filp;
 -      fl->fl_start = 0;
 -      fl->fl_end = OFFSET_MAX;
        fl->fl_flags |= FL_POSIX;
        fl->fl_flags ^= FL_FLOCK;
  
@@@ -681,7 -684,7 +681,7 @@@ v9fs_direct_read(struct file *filp, cha
  /**
   * v9fs_cached_file_read - read from a file
   * @filp: file pointer to read
 - * @udata: user data buffer to read data into
 + * @data: user data buffer to read data into
   * @count: size of buffer
   * @offset: offset at which to read data
   *
@@@ -692,13 -695,13 +692,13 @@@ v9fs_cached_file_read(struct file *filp
  {
        if (filp->f_flags & O_DIRECT)
                return v9fs_direct_read(filp, data, count, offset);
-       return do_sync_read(filp, data, count, offset);
+       return new_sync_read(filp, data, count, offset);
  }
  
  /**
   * v9fs_mmap_file_read - read from a file
   * @filp: file pointer to read
 - * @udata: user data buffer to read data into
 + * @data: user data buffer to read data into
   * @count: size of buffer
   * @offset: offset at which to read data
   *
@@@ -760,7 -763,7 +760,7 @@@ err_out
  
  buff_write:
        mutex_unlock(&inode->i_mutex);
-       return do_sync_write(filp, data, count, offsetp);
+       return new_sync_write(filp, data, count, offsetp);
  }
  
  /**
@@@ -778,7 -781,7 +778,7 @@@ v9fs_cached_file_write(struct file *fil
  
        if (filp->f_flags & O_DIRECT)
                return v9fs_direct_write(filp, data, count, offset);
-       return do_sync_write(filp, data, count, offset);
+       return new_sync_write(filp, data, count, offset);
  }
  
  
@@@ -847,8 -850,8 +847,8 @@@ const struct file_operations v9fs_cache
        .llseek = generic_file_llseek,
        .read = v9fs_cached_file_read,
        .write = v9fs_cached_file_write,
-       .aio_read = generic_file_aio_read,
-       .aio_write = generic_file_aio_write,
+       .read_iter = generic_file_read_iter,
+       .write_iter = generic_file_write_iter,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
        .lock = v9fs_file_lock,
@@@ -860,8 -863,8 +860,8 @@@ const struct file_operations v9fs_cache
        .llseek = generic_file_llseek,
        .read = v9fs_cached_file_read,
        .write = v9fs_cached_file_write,
-       .aio_read = generic_file_aio_read,
-       .aio_write = generic_file_aio_write,
+       .read_iter = generic_file_read_iter,
+       .write_iter = generic_file_write_iter,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
        .lock = v9fs_file_lock_dotl,
diff --combined fs/affs/file.c
index 0270303388ee669515c8829f7370da24db6d16e2,9df23175e28b910e5cec924dd87249956d4051a7..a7fe57d2cd9a0aa6a59df2cd90778127ebc2bbc3
@@@ -27,10 -27,10 +27,10 @@@ static int affs_file_release(struct ino
  
  const struct file_operations affs_file_operations = {
        .llseek         = generic_file_llseek,
-       .read           = do_sync_read,
-       .aio_read       = generic_file_aio_read,
-       .write          = do_sync_write,
-       .aio_write      = generic_file_aio_write,
+       .read           = new_sync_read,
+       .read_iter      = generic_file_read_iter,
+       .write          = new_sync_write,
+       .write_iter     = generic_file_write_iter,
        .mmap           = generic_file_mmap,
        .open           = affs_file_open,
        .release        = affs_file_release,
@@@ -45,7 -45,7 +45,7 @@@ const struct inode_operations affs_file
  static int
  affs_file_open(struct inode *inode, struct file *filp)
  {
 -      pr_debug("AFFS: open(%lu,%d)\n",
 +      pr_debug("open(%lu,%d)\n",
                 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
        atomic_inc(&AFFS_I(inode)->i_opencnt);
        return 0;
@@@ -54,7 -54,7 +54,7 @@@
  static int
  affs_file_release(struct inode *inode, struct file *filp)
  {
 -      pr_debug("AFFS: release(%lu, %d)\n",
 +      pr_debug("release(%lu, %d)\n",
                 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
  
        if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
@@@ -324,8 -324,7 +324,8 @@@ affs_get_block(struct inode *inode, sec
        struct buffer_head      *ext_bh;
        u32                      ext;
  
 -      pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block);
 +      pr_debug("%s(%u, %lu)\n",
 +               __func__, (u32)inode->i_ino, (unsigned long)block);
  
        BUG_ON(block > (sector_t)0x7fffffffUL);
  
@@@ -499,36 -498,34 +499,36 @@@ affs_getemptyblk_ino(struct inode *inod
  }
  
  static int
 -affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to)
 +affs_do_readpage_ofs(struct page *page, unsigned to)
  {
        struct inode *inode = page->mapping->host;
        struct super_block *sb = inode->i_sb;
        struct buffer_head *bh;
        char *data;
 +      unsigned pos = 0;
        u32 bidx, boff, bsize;
        u32 tmp;
  
 -      pr_debug("AFFS: read_page(%u, %ld, %d, %d)\n", (u32)inode->i_ino, page->index, from, to);
 -      BUG_ON(from > to || to > PAGE_CACHE_SIZE);
 +      pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino,
 +               page->index, to);
 +      BUG_ON(to > PAGE_CACHE_SIZE);
        kmap(page);
        data = page_address(page);
        bsize = AFFS_SB(sb)->s_data_blksize;
 -      tmp = (page->index << PAGE_CACHE_SHIFT) + from;
 +      tmp = page->index << PAGE_CACHE_SHIFT;
        bidx = tmp / bsize;
        boff = tmp % bsize;
  
 -      while (from < to) {
 +      while (pos < to) {
                bh = affs_bread_ino(inode, bidx, 0);
                if (IS_ERR(bh))
                        return PTR_ERR(bh);
 -              tmp = min(bsize - boff, to - from);
 -              BUG_ON(from + tmp > to || tmp > bsize);
 -              memcpy(data + from, AFFS_DATA(bh) + boff, tmp);
 +              tmp = min(bsize - boff, to - pos);
 +              BUG_ON(pos + tmp > to || tmp > bsize);
 +              memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
                affs_brelse(bh);
                bidx++;
 -              from += tmp;
 +              pos += tmp;
                boff = 0;
        }
        flush_dcache_page(page);
@@@ -545,7 -542,7 +545,7 @@@ affs_extent_file_ofs(struct inode *inod
        u32 size, bsize;
        u32 tmp;
  
 -      pr_debug("AFFS: extent_file(%u, %d)\n", (u32)inode->i_ino, newsize);
 +      pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize);
        bsize = AFFS_SB(sb)->s_data_blksize;
        bh = NULL;
        size = AFFS_I(inode)->mmu_private;
@@@ -611,14 -608,14 +611,14 @@@ affs_readpage_ofs(struct file *file, st
        u32 to;
        int err;
  
 -      pr_debug("AFFS: read_page(%u, %ld)\n", (u32)inode->i_ino, page->index);
 +      pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index);
        to = PAGE_CACHE_SIZE;
        if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
                to = inode->i_size & ~PAGE_CACHE_MASK;
                memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to);
        }
  
 -      err = affs_do_readpage_ofs(file, page, 0, to);
 +      err = affs_do_readpage_ofs(page, to);
        if (!err)
                SetPageUptodate(page);
        unlock_page(page);
@@@ -634,8 -631,7 +634,8 @@@ static int affs_write_begin_ofs(struct 
        pgoff_t index;
        int err = 0;
  
 -      pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len);
 +      pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino,
 +               (unsigned long long)pos, (unsigned long long)pos + len);
        if (pos > AFFS_I(inode)->mmu_private) {
                /* XXX: this probably leaves a too-big i_size in case of
                 * failure. Should really be updating i_size at write_end time
                return 0;
  
        /* XXX: inefficient but safe in the face of short writes */
 -      err = affs_do_readpage_ofs(file, page, 0, PAGE_CACHE_SIZE);
 +      err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE);
        if (err) {
                unlock_page(page);
                page_cache_release(page);
@@@ -684,9 -680,7 +684,9 @@@ static int affs_write_end_ofs(struct fi
         * due to write_begin.
         */
  
 -      pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len);
 +      pr_debug("%s(%u, %llu, %llu)\n",
 +               __func__, (u32)inode->i_ino, (unsigned long long)pos,
 +               (unsigned long long)pos + len);
        bsize = AFFS_SB(sb)->s_data_blksize;
        data = page_address(page);
  
@@@ -808,7 -802,7 +808,7 @@@ affs_free_prealloc(struct inode *inode
  {
        struct super_block *sb = inode->i_sb;
  
 -      pr_debug("AFFS: free_prealloc(ino=%lu)\n", inode->i_ino);
 +      pr_debug("free_prealloc(ino=%lu)\n", inode->i_ino);
  
        while (AFFS_I(inode)->i_pa_cnt) {
                AFFS_I(inode)->i_pa_cnt--;
@@@ -828,7 -822,7 +828,7 @@@ affs_truncate(struct inode *inode
        struct buffer_head *ext_bh;
        int i;
  
 -      pr_debug("AFFS: truncate(inode=%d, oldsize=%u, newsize=%u)\n",
 +      pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n",
                 (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size);
  
        last_blk = 0;
diff --combined fs/block_dev.c
index 83fba15cc394071a53b57245b95942c18177d9c7,e68e150b1b163c15da172cfa60ed832d14841495..6d7274619bf916c2dcf0d7744ba8d888d948d711
@@@ -165,14 -165,15 +165,15 @@@ blkdev_get_block(struct inode *inode, s
  }
  
  static ssize_t
- blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-                       loff_t offset, unsigned long nr_segs)
+ blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+                       loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
  
-       return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
-                                   nr_segs, blkdev_get_block, NULL, NULL, 0);
+       return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
+                                   offset, blkdev_get_block,
+                                   NULL, NULL, 0);
  }
  
  int __sync_blockdev(struct block_device *bdev, int wait)
@@@ -363,69 -364,6 +364,69 @@@ int blkdev_fsync(struct file *filp, lof
  }
  EXPORT_SYMBOL(blkdev_fsync);
  
 +/**
 + * bdev_read_page() - Start reading a page from a block device
 + * @bdev: The device to read the page from
 + * @sector: The offset on the device to read the page from (need not be aligned)
 + * @page: The page to read
 + *
 + * On entry, the page should be locked.  It will be unlocked when the page
 + * has been read.  If the block driver implements rw_page synchronously,
 + * that will be true on exit from this function, but it need not be.
 + *
 + * Errors returned by this function are usually "soft", eg out of memory, or
 + * queue full; callers should try a different route to read this page rather
 + * than propagate an error back up the stack.
 + *
 + * Return: negative errno if an error occurs, 0 if submission was successful.
 + */
 +int bdev_read_page(struct block_device *bdev, sector_t sector,
 +                      struct page *page)
 +{
 +      const struct block_device_operations *ops = bdev->bd_disk->fops;
 +      if (!ops->rw_page)
 +              return -EOPNOTSUPP;
 +      return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
 +}
 +EXPORT_SYMBOL_GPL(bdev_read_page);
 +
 +/**
 + * bdev_write_page() - Start writing a page to a block device
 + * @bdev: The device to write the page to
 + * @sector: The offset on the device to write the page to (need not be aligned)
 + * @page: The page to write
 + * @wbc: The writeback_control for the write
 + *
 + * On entry, the page should be locked and not currently under writeback.
 + * On exit, if the write started successfully, the page will be unlocked and
 + * under writeback.  If the write failed already (eg the driver failed to
 + * queue the page to the device), the page will still be locked.  If the
 + * caller is a ->writepage implementation, it will need to unlock the page.
 + *
 + * Errors returned by this function are usually "soft", eg out of memory, or
 + * queue full; callers should try a different route to write this page rather
 + * than propagate an error back up the stack.
 + *
 + * Return: negative errno if an error occurs, 0 if submission was successful.
 + */
 +int bdev_write_page(struct block_device *bdev, sector_t sector,
 +                      struct page *page, struct writeback_control *wbc)
 +{
 +      int result;
 +      int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
 +      const struct block_device_operations *ops = bdev->bd_disk->fops;
 +      if (!ops->rw_page)
 +              return -EOPNOTSUPP;
 +      set_page_writeback(page);
 +      result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
 +      if (result)
 +              end_page_writeback(page);
 +      else
 +              unlock_page(page);
 +      return result;
 +}
 +EXPORT_SYMBOL_GPL(bdev_write_page);
 +
  /*
   * pseudo-fs
   */
@@@ -1571,43 -1509,38 +1572,38 @@@ static long block_ioctl(struct file *fi
   * Does not take i_mutex for the write and thus is not for general purpose
   * use.
   */
- ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                        unsigned long nr_segs, loff_t pos)
+ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct blk_plug plug;
        ssize_t ret;
  
-       BUG_ON(iocb->ki_pos != pos);
        blk_start_plug(&plug);
-       ret = __generic_file_aio_write(iocb, iov, nr_segs);
+       ret = __generic_file_write_iter(iocb, from);
        if (ret > 0) {
                ssize_t err;
-               err = generic_write_sync(file, pos, ret);
+               err = generic_write_sync(file, iocb->ki_pos - ret, ret);
                if (err < 0)
                        ret = err;
        }
        blk_finish_plug(&plug);
        return ret;
  }
- EXPORT_SYMBOL_GPL(blkdev_aio_write);
+ EXPORT_SYMBOL_GPL(blkdev_write_iter);
  
- static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
-                        unsigned long nr_segs, loff_t pos)
+ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
        struct file *file = iocb->ki_filp;
        struct inode *bd_inode = file->f_mapping->host;
        loff_t size = i_size_read(bd_inode);
+       loff_t pos = iocb->ki_pos;
  
        if (pos >= size)
                return 0;
  
        size -= pos;
-       if (size < iocb->ki_nbytes)
-               nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
-       return generic_file_aio_read(iocb, iov, nr_segs, pos);
+       iov_iter_truncate(to, size);
+       return generic_file_read_iter(iocb, to);
  }
  
  /*
@@@ -1639,10 -1572,10 +1635,10 @@@ const struct file_operations def_blk_fo
        .open           = blkdev_open,
        .release        = blkdev_close,
        .llseek         = block_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = blkdev_aio_read,
-       .aio_write      = blkdev_aio_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = blkdev_read_iter,
+       .write_iter     = blkdev_write_iter,
        .mmap           = generic_file_mmap,
        .fsync          = blkdev_fsync,
        .unlocked_ioctl = block_ioctl,
        .compat_ioctl   = compat_blkdev_ioctl,
  #endif
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
  };
  
  int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --combined fs/btrfs/file.c
index e472441feb5de80ce68c85e6f892531893706a13,17e7393c50f0a97484ec9ce7898c9b6cf7aacdfa..1f2b99cb55eaef682c51ae3c8b227e88aafbf7e7
@@@ -40,7 -40,6 +40,7 @@@
  #include "tree-log.h"
  #include "locking.h"
  #include "volumes.h"
 +#include "qgroup.h"
  
  static struct kmem_cache *btrfs_inode_defrag_cachep;
  /*
@@@ -448,7 -447,7 +448,7 @@@ static noinline int btrfs_copy_from_use
                write_bytes -= copied;
                total_copied += copied;
  
-               /* Return to btrfs_file_aio_write to fault page */
+               /* Return to btrfs_file_write_iter to fault page */
                if (unlikely(copied == 0))
                        break;
  
@@@ -471,12 -470,11 +471,12 @@@ static void btrfs_drop_pages(struct pag
        for (i = 0; i < num_pages; i++) {
                /* page checked is some magic around finding pages that
                 * have been modified without going through btrfs_set_page_dirty
 -               * clear it here
 +               * clear it here. There should be no need to mark the pages
 +               * clear it here. There should be no need to mark the pages
 +               * accessed, as prepare_pages already marked them accessed
 +               * via find_or_create_page()
                ClearPageChecked(pages[i]);
                unlock_page(pages[i]);
 -              mark_page_accessed(pages[i]);
                page_cache_release(pages[i]);
        }
  }
@@@ -716,7 -714,7 +716,7 @@@ int __btrfs_drop_extents(struct btrfs_t
        int recow;
        int ret;
        int modify_tree = -1;
 -      int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
 +      int update_refs;
        int found = 0;
        int leafs_visited = 0;
  
        if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
                modify_tree = 0;
  
 +      update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 +                     root == root->fs_info->tree_root);
        while (1) {
                recow = 0;
                ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@@ -784,18 -780,6 +784,18 @@@ next_slot
                        extent_end = search_start;
                }
  
 +              /*
 +               * Don't skip extent items representing 0 byte lengths. They
 +               * used to be created (a bug) when we hit an -ENOSPC condition
 +               * while punching holes. So if we find one here, just ensure
 +               * we delete it; otherwise we would insert a new file extent
 +               * item with the same key (offset) as that 0-byte-length file
 +               * extent item in the call to setup_items_for_insert() later
 +               * in this function.
 +               */
 +              if (extent_end == key.offset && extent_end >= search_start)
 +                      goto delete_extent_item;
 +
                if (extent_end <= search_start) {
                        path->slots[0]++;
                        goto next_slot;
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
 -                                              start - extent_offset, 0);
 +                                              start - extent_offset, 1);
                                BUG_ON(ret); /* -ENOMEM */
                        }
                        key.offset = start;
                 *    | ------ extent ------ |
                 */
                if (start <= key.offset && end >= extent_end) {
 +delete_extent_item:
                        if (del_nr == 0) {
                                del_slot = path->slots[0];
                                del_nr = 1;
@@@ -1208,7 -1191,7 +1208,7 @@@ again
  
                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                                           root->root_key.objectid,
 -                                         ino, orig_offset, 0);
 +                                         ino, orig_offset, 1);
                BUG_ON(ret); /* -ENOMEM */
  
                if (split == start) {
@@@ -1675,27 -1658,22 +1675,22 @@@ again
  }
  
  static ssize_t __btrfs_direct_write(struct kiocb *iocb,
-                                   const struct iovec *iov,
-                                   unsigned long nr_segs, loff_t pos,
-                                   size_t count, size_t ocount)
+                                   struct iov_iter *from,
+                                   loff_t pos)
  {
        struct file *file = iocb->ki_filp;
-       struct iov_iter i;
        ssize_t written;
        ssize_t written_buffered;
        loff_t endbyte;
        int err;
  
-       written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
-                                           count, ocount);
+       written = generic_file_direct_write(iocb, from, pos);
  
-       if (written < 0 || written == count)
+       if (written < 0 || !iov_iter_count(from))
                return written;
  
        pos += written;
-       count -= written;
-       iov_iter_init(&i, iov, nr_segs, count, written);
-       written_buffered = __btrfs_buffered_write(file, &i, pos);
+       written_buffered = __btrfs_buffered_write(file, from, pos);
        if (written_buffered < 0) {
                err = written_buffered;
                goto out;
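
The conversion above keeps btrfs's direct-then-buffered fallback, but the residue tracking now lives in the iterator: generic_file_direct_write() advances 'from' past whatever went direct. A simplified sketch (buffered_tail() is hypothetical; the real code also syncs and invalidates the buffered range):

/* hypothetical helper that writes the remainder through the page cache */
static ssize_t buffered_tail(struct kiocb *iocb, struct iov_iter *from,
                             loff_t pos);

static ssize_t direct_with_fallback(struct kiocb *iocb, struct iov_iter *from,
                                    loff_t pos)
{
        ssize_t written = generic_file_direct_write(iocb, from, pos);
        ssize_t buffered;

        if (written < 0 || !iov_iter_count(from))
                return written; /* error, or everything went direct */

        /* 'from' now describes exactly the unwritten tail */
        buffered = buffered_tail(iocb, from, pos + written);
        return buffered < 0 ? buffered : written + buffered;
}
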
@@@ -1730,9 -1708,8 +1725,8 @@@ static void update_time_for_write(struc
                inode_inc_iversion(inode);
  }
  
- static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
-                                   const struct iovec *iov,
-                                   unsigned long nr_segs, loff_t pos)
+ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+                                   struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        u64 end_pos;
        ssize_t num_written = 0;
        ssize_t err = 0;
-       size_t count, ocount;
+       size_t count = iov_iter_count(from);
        bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+       loff_t pos = iocb->ki_pos;
  
        mutex_lock(&inode->i_mutex);
  
-       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-       if (err) {
-               mutex_unlock(&inode->i_mutex);
-               goto out;
-       }
-       count = ocount;
        current->backing_dev_info = inode->i_mapping->backing_dev_info;
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err) {
                goto out;
        }
  
+       iov_iter_truncate(from, count);
        err = file_remove_suid(file);
        if (err) {
                mutex_unlock(&inode->i_mutex);
                atomic_inc(&BTRFS_I(inode)->sync_writers);
  
        if (unlikely(file->f_flags & O_DIRECT)) {
-               num_written = __btrfs_direct_write(iocb, iov, nr_segs,
-                                                  pos, count, ocount);
+               num_written = __btrfs_direct_write(iocb, from, pos);
        } else {
-               struct iov_iter i;
-               iov_iter_init(&i, iov, nr_segs, count, num_written);
-               num_written = __btrfs_buffered_write(file, &i, pos);
+               num_written = __btrfs_buffered_write(file, from, pos);
                if (num_written > 0)
                        iocb->ki_pos = pos + num_written;
        }
@@@ -2026,10 -1994,8 +2011,10 @@@ int btrfs_sync_file(struct file *file, 
                if (!full_sync) {
                        ret = btrfs_wait_ordered_range(inode, start,
                                                       end - start + 1);
 -                      if (ret)
 +                      if (ret) {
 +                              btrfs_end_transaction(trans, root);
                                goto out;
 +                      }
                }
                ret = btrfs_commit_transaction(trans, root);
        } else {
        return 0;
  }
  
 +/*
 + * Find a hole extent on the given inode and change start/len to the end of
 + * the hole extent (a hole/vacuum extent whose em->start <= start &&
 + * em->start + em->len > start).
 + * When a hole extent is found, return 1 and modify start/len.
 + */
 +static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
 +{
 +      struct extent_map *em;
 +      int ret = 0;
 +
 +      em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
 +      if (IS_ERR_OR_NULL(em)) {
 +              if (!em)
 +                      ret = -ENOMEM;
 +              else
 +                      ret = PTR_ERR(em);
 +              return ret;
 +      }
 +
 +      /* Hole or vacuum extent (only exists in no-hole mode) */
 +      if (em->block_start == EXTENT_MAP_HOLE) {
 +              ret = 1;
 +              *len = em->start + em->len > *start + *len ?
 +                     0 : *start + *len - em->start - em->len;
 +              *start = em->start + em->len;
 +      }
 +      free_extent_map(em);
 +      return ret;
 +}
 +
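
A hedged usage sketch of the helper just added, mirroring how btrfs_punch_hole() calls it below (skip_known_holes() is a hypothetical wrapper):

static int skip_known_holes(struct inode *inode, u64 *offset, u64 *len)
{
        int ret = find_first_non_hole(inode, offset, len);

        if (ret < 0)
                return ret;             /* extent lookup failed */
        if (ret && *len == 0)
                return 1;               /* range is already a hole */
        return 0;       /* [*offset, *offset + *len) still needs punching */
}
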
  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_path *path;
        struct btrfs_block_rsv *rsv;
        struct btrfs_trans_handle *trans;
 -      u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
 -      u64 lockend = round_down(offset + len,
 -                               BTRFS_I(inode)->root->sectorsize) - 1;
 -      u64 cur_offset = lockstart;
 +      u64 lockstart;
 +      u64 lockend;
 +      u64 tail_start;
 +      u64 tail_len;
 +      u64 orig_start = offset;
 +      u64 cur_offset;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
        u64 drop_end;
        int ret = 0;
        int err = 0;
        int rsv_count;
 -      bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
 -                        ((offset + len - 1) >> PAGE_CACHE_SHIFT));
 +      bool same_page;
        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 -      u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
 +      u64 ino_size;
  
        ret = btrfs_wait_ordered_range(inode, offset, len);
        if (ret)
                return ret;
  
        mutex_lock(&inode->i_mutex);
 +      ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
 +      ret = find_first_non_hole(inode, &offset, &len);
 +      if (ret < 0)
 +              goto out_only_mutex;
 +      if (ret && !len) {
 +              /* Already in a large hole */
 +              ret = 0;
 +              goto out_only_mutex;
 +      }
 +
 +      lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
 +      lockend = round_down(offset + len,
 +                           BTRFS_I(inode)->root->sectorsize) - 1;
 +      same_page = ((offset >> PAGE_CACHE_SHIFT) ==
 +                  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
 +
        /*
         * We needn't truncate any page which is beyond the end of the file
         * because we are sure there is no data there.
        if (same_page && len < PAGE_CACHE_SIZE) {
                if (offset < ino_size)
                        ret = btrfs_truncate_page(inode, offset, len, 0);
 -              mutex_unlock(&inode->i_mutex);
 -              return ret;
 +              goto out_only_mutex;
        }
  
        /* zero back part of the first page */
                }
        }
  
 -      /* zero the front end of the last page */
 -      if (offset + len < ino_size) {
 -              ret = btrfs_truncate_page(inode, offset + len, 0, 1);
 -              if (ret) {
 -                      mutex_unlock(&inode->i_mutex);
 -                      return ret;
 +      /* Check the aligned pages after the first unaligned page.
 +       * If offset != orig_start, the first unaligned page and several
 +       * following pages are already in holes, so the extra check can
 +       * be skipped */
 +      if (offset == orig_start) {
 +              /* after truncate page, check hole again */
 +              len = offset + len - lockstart;
 +              offset = lockstart;
 +              ret = find_first_non_hole(inode, &offset, &len);
 +              if (ret < 0)
 +                      goto out_only_mutex;
 +              if (ret && !len) {
 +                      ret = 0;
 +                      goto out_only_mutex;
 +              }
 +              lockstart = offset;
 +      }
 +
 +      /* Check whether the unaligned tail part is in a hole */
 +      tail_start = lockend + 1;
 +      tail_len = offset + len - tail_start;
 +      if (tail_len) {
 +              ret = find_first_non_hole(inode, &tail_start, &tail_len);
 +              if (unlikely(ret < 0))
 +                      goto out_only_mutex;
 +              if (!ret) {
 +                      /* zero the front end of the last page */
 +                      if (tail_start + tail_len < ino_size) {
 +                              ret = btrfs_truncate_page(inode,
 +                                              tail_start + tail_len, 0, 1);
 +                              if (ret)
 +                                      goto out_only_mutex;
 +                              }
                }
        }
  
                if ((!ordered ||
                    (ordered->file_offset + ordered->len <= lockstart ||
                     ordered->file_offset > lockend)) &&
 -                   !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
 -                                   lockend, EXTENT_UPTODATE, 0,
 -                                   cached_state)) {
 +                   !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
                        if (ordered)
                                btrfs_put_ordered_extent(ordered);
                        break;
        BUG_ON(ret);
        trans->block_rsv = rsv;
  
 +      cur_offset = lockstart;
 +      len = lockend - cur_offset;
        while (cur_offset < lockend) {
                ret = __btrfs_drop_extents(trans, root, inode, path,
                                           cur_offset, lockend + 1,
                                              rsv, min_size);
                BUG_ON(ret);    /* shouldn't happen */
                trans->block_rsv = rsv;
 +
 +              ret = find_first_non_hole(inode, &cur_offset, &len);
 +              if (unlikely(ret < 0))
 +                      break;
 +              if (ret && !len) {
 +                      ret = 0;
 +                      break;
 +              }
        }
  
        if (ret) {
        }
  
        trans->block_rsv = &root->fs_info->trans_block_rsv;
 -      if (cur_offset < ino_size) {
 +      /*
 +       * Don't insert file hole extent item if it's for a range beyond eof
 +       * (because it's useless) or if it represents a 0 bytes range (when
 +       * cur_offset == drop_end).
 +       */
 +      if (cur_offset < ino_size && cur_offset < drop_end) {
                ret = fill_holes(trans, inode, path, cur_offset, drop_end);
                if (ret) {
                        err = ret;
@@@ -2478,7 -2357,6 +2463,7 @@@ out_free
  out:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             &cached_state, GFP_NOFS);
 +out_only_mutex:
        mutex_unlock(&inode->i_mutex);
        if (ret && !err)
                err = ret;
  
  const struct file_operations btrfs_file_operations = {
        .llseek         = btrfs_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = generic_file_aio_read,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = generic_file_read_iter,
        .splice_read    = generic_file_splice_read,
-       .aio_write      = btrfs_file_aio_write,
+       .write_iter     = btrfs_file_write_iter,
        .mmap           = btrfs_file_mmap,
        .open           = generic_file_open,
        .release        = btrfs_release_file,
diff --combined fs/btrfs/inode.c
index 7fa5f7fd7bc79259ed5a5e51131cbf6c8d07d919,c8386f1961f001586f5f1824e7e02d1b63bf4470..8925f66a14115c9d733182f2ec4d113be5be5edd
@@@ -125,7 -125,7 +125,7 @@@ static int btrfs_init_inode_security(st
   * the btree.  The caller should have done a btrfs_drop_extents so that
   * no overlapping inline items exist in the btree
   */
 -static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 +static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
@@@ -2678,7 -2678,6 +2678,7 @@@ static int btrfs_finish_ordered_io(stru
                trans = NULL;
                goto out_unlock;
        }
 +
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@@ -2948,15 -2947,14 +2948,15 @@@ void btrfs_orphan_commit_root(struct bt
        root->orphan_block_rsv = NULL;
        spin_unlock(&root->orphan_lock);
  
 -      if (root->orphan_item_inserted &&
 +      if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
            btrfs_root_refs(&root->root_item) > 0) {
                ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
                                            root->root_key.objectid);
                if (ret)
                        btrfs_abort_transaction(trans, root, ret);
                else
 -                      root->orphan_item_inserted = 0;
 +                      clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
 +                                &root->state);
        }
  
        if (block_rsv) {
@@@ -3273,8 -3271,7 +3273,8 @@@ int btrfs_orphan_cleanup(struct btrfs_r
                btrfs_block_rsv_release(root, root->orphan_block_rsv,
                                        (u64)-1);
  
 -      if (root->orphan_block_rsv || root->orphan_item_inserted) {
 +      if (root->orphan_block_rsv ||
 +          test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
                trans = btrfs_join_transaction(root);
                if (!IS_ERR(trans))
                        btrfs_end_transaction(trans, root);
@@@ -3476,7 -3473,7 +3476,7 @@@ cache_acl
                ret = btrfs_load_inode_props(inode, path);
                if (ret)
                        btrfs_err(root->fs_info,
 -                                "error loading props for ino %llu (root %llu): %d\n",
 +                                "error loading props for ino %llu (root %llu): %d",
                                  btrfs_ino(inode),
                                  root->root_key.objectid, ret);
        }
@@@ -4001,8 -3998,7 +4001,8 @@@ int btrfs_truncate_inode_items(struct b
         * not block aligned since we will be keeping the last block of the
         * extent just the way it is.
         */
 -      if (root->ref_cows || root == root->fs_info->tree_root)
 +      if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 +          root == root->fs_info->tree_root)
                btrfs_drop_extent_cache(inode, ALIGN(new_size,
                                        root->sectorsize), (u64)-1, 0);
  
@@@ -4095,9 -4091,7 +4095,9 @@@ search_again
                                                         extent_num_bytes);
                                num_dec = (orig_num_bytes -
                                           extent_num_bytes);
 -                              if (root->ref_cows && extent_start != 0)
 +                              if (test_bit(BTRFS_ROOT_REF_COWS,
 +                                           &root->state) &&
 +                                  extent_start != 0)
                                        inode_sub_bytes(inode, num_dec);
                                btrfs_mark_buffer_dirty(leaf);
                        } else {
                                num_dec = btrfs_file_extent_num_bytes(leaf, fi);
                                if (extent_start != 0) {
                                        found_extent = 1;
 -                                      if (root->ref_cows)
 +                                      if (test_bit(BTRFS_ROOT_REF_COWS,
 +                                                   &root->state))
                                                inode_sub_bytes(inode, num_dec);
                                }
                        }
                            btrfs_file_extent_other_encoding(leaf, fi) == 0) {
                                u32 size = new_size - found_key.offset;
  
 -                              if (root->ref_cows) {
 +                              if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
                                        inode_sub_bytes(inode, item_end + 1 -
                                                        new_size);
 -                              }
  
                                /*
                                 * update the ram bytes to properly reflect
                                size =
                                    btrfs_file_extent_calc_inline_size(size);
                                btrfs_truncate_item(root, path, size, 1);
 -                      } else if (root->ref_cows) {
 +                      } else if (test_bit(BTRFS_ROOT_REF_COWS,
 +                                          &root->state)) {
                                inode_sub_bytes(inode, item_end + 1 -
                                                found_key.offset);
                        }
@@@ -4162,9 -4155,8 +4162,9 @@@ delete
                } else {
                        break;
                }
 -              if (found_extent && (root->ref_cows ||
 -                                   root == root->fs_info->tree_root)) {
 +              if (found_extent &&
 +                  (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 +                   root == root->fs_info->tree_root)) {
                        btrfs_set_path_blocking(path);
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
@@@ -5176,7 -5168,8 +5176,7 @@@ static int btrfs_dentry_delete(const st
  
  static void btrfs_dentry_release(struct dentry *dentry)
  {
 -      if (dentry->d_fsdata)
 -              kfree(dentry->d_fsdata);
 +      kfree(dentry->d_fsdata);
  }
  
  static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
@@@ -5560,7 -5553,6 +5560,7 @@@ static struct inode *btrfs_new_inode(st
        struct btrfs_inode_ref *ref;
        struct btrfs_key key[2];
        u32 sizes[2];
 +      int nitems = name ? 2 : 1;
        unsigned long ptr;
        int ret;
  
         */
        inode->i_ino = objectid;
  
 -      if (dir) {
 +      if (dir && name) {
                trace_btrfs_inode_request(dir);
  
                ret = btrfs_set_inode_index(dir, index);
                        iput(inode);
                        return ERR_PTR(ret);
                }
 +      } else if (dir) {
 +              *index = 0;
        }
        /*
         * index_cnt is ignored for everything but a dir,
        btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
        key[0].offset = 0;
  
 -      /*
 -       * Start new inodes with an inode_ref. This is slightly more
 -       * efficient for small numbers of hard links since they will
 -       * be packed into one item. Extended refs will kick in if we
 -       * add more hard links than can fit in the ref item.
 -       */
 -      key[1].objectid = objectid;
 -      btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
 -      key[1].offset = ref_objectid;
 -
        sizes[0] = sizeof(struct btrfs_inode_item);
 -      sizes[1] = name_len + sizeof(*ref);
 +
 +      if (name) {
 +              /*
 +               * Start new inodes with an inode_ref. This is slightly more
 +               * efficient for small numbers of hard links since they will
 +               * be packed into one item. Extended refs will kick in if we
 +               * add more hard links than can fit in the ref item.
 +               */
 +              key[1].objectid = objectid;
 +              btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
 +              key[1].offset = ref_objectid;
 +
 +              sizes[1] = name_len + sizeof(*ref);
 +      }
  
        path->leave_spinning = 1;
 -      ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
 +      ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
        if (ret != 0)
                goto fail;
  
                             sizeof(*inode_item));
        fill_inode_item(trans, path->nodes[0], inode_item, inode);
  
 -      ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 -                           struct btrfs_inode_ref);
 -      btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
 -      btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
 -      ptr = (unsigned long)(ref + 1);
 -      write_extent_buffer(path->nodes[0], name, ptr, name_len);
 +      if (name) {
 +              ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 +                                   struct btrfs_inode_ref);
 +              btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
 +              btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
 +              ptr = (unsigned long)(ref + 1);
 +              write_extent_buffer(path->nodes[0], name, ptr, name_len);
 +      }
  
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
  
        return inode;
  fail:
 -      if (dir)
 +      if (dir && name)
                BTRFS_I(dir)->index_cnt--;
        btrfs_free_path(path);
        iput(inode);
@@@ -5973,15 -5958,6 +5973,15 @@@ static int btrfs_link(struct dentry *ol
                err = btrfs_update_inode(trans, root, inode);
                if (err)
                        goto fail;
 +              if (inode->i_nlink == 1) {
 +                      /*
 +                       * If the new hard link count is 1, the file was
 +                       * created with the open(2) O_TMPFILE flag.
 +                       */
 +                      err = btrfs_orphan_del(trans, inode);
 +                      if (err)
 +                              goto fail;
 +              }
                d_instantiate(dentry, inode);
                btrfs_log_new_name(trans, inode, NULL, parent);
        }
@@@ -6110,8 -6086,16 +6110,8 @@@ static noinline int uncompress_inline(s
        max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
        ret = btrfs_decompress(compress_type, tmp, page,
                               extent_offset, inline_size, max_size);
 -      if (ret) {
 -              char *kaddr = kmap_atomic(page);
 -              unsigned long copy_size = min_t(u64,
 -                                PAGE_CACHE_SIZE - pg_offset,
 -                                max_size - extent_offset);
 -              memset(kaddr + pg_offset, 0, copy_size);
 -              kunmap_atomic(kaddr);
 -      }
        kfree(tmp);
 -      return 0;
 +      return ret;
  }
  
  /*
@@@ -6129,6 -6113,7 +6129,6 @@@ struct extent_map *btrfs_get_extent(str
  {
        int ret;
        int err = 0;
 -      u64 bytenr;
        u64 extent_start = 0;
        u64 extent_end = 0;
        u64 objectid = btrfs_ino(inode);
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_trans_handle *trans = NULL;
 -      int compress_type;
 +      const bool new_inline = !page || create;
  
  again:
        read_lock(&em_tree->lock);
  
        found_type = btrfs_file_extent_type(leaf, item);
        extent_start = found_key.offset;
 -      compress_type = btrfs_file_extent_compression(leaf, item);
        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                extent_end = extent_start +
@@@ -6250,10 -6236,32 +6250,10 @@@ next
                goto not_found_em;
        }
  
 -      em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
 +      btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
 +
        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 -              em->start = extent_start;
 -              em->len = extent_end - extent_start;
 -              em->orig_start = extent_start -
 -                               btrfs_file_extent_offset(leaf, item);
 -              em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
 -                                                                    item);
 -              bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 -              if (bytenr == 0) {
 -                      em->block_start = EXTENT_MAP_HOLE;
 -                      goto insert;
 -              }
 -              if (compress_type != BTRFS_COMPRESS_NONE) {
 -                      set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 -                      em->compress_type = compress_type;
 -                      em->block_start = bytenr;
 -                      em->block_len = em->orig_block_len;
 -              } else {
 -                      bytenr += btrfs_file_extent_offset(leaf, item);
 -                      em->block_start = bytenr;
 -                      em->block_len = em->len;
 -                      if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
 -                              set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 -              }
                goto insert;
        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                unsigned long ptr;
                size_t extent_offset;
                size_t copy_size;
  
 -              em->block_start = EXTENT_MAP_INLINE;
 -              if (!page || create) {
 -                      em->start = extent_start;
 -                      em->len = extent_end - extent_start;
 +              if (new_inline)
                        goto out;
 -              }
  
                size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
                extent_offset = page_offset(page) + pg_offset - extent_start;
                em->len = ALIGN(copy_size, root->sectorsize);
                em->orig_block_len = em->len;
                em->orig_start = em->start;
 -              if (compress_type) {
 -                      set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 -                      em->compress_type = compress_type;
 -              }
                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
                if (create == 0 && !PageUptodate(page)) {
                        if (btrfs_file_extent_compression(leaf, item) !=
                                ret = uncompress_inline(path, inode, page,
                                                        pg_offset,
                                                        extent_offset, item);
 -                              BUG_ON(ret); /* -ENOMEM */
 +                              if (ret) {
 +                                      err = ret;
 +                                      goto out;
 +                              }
                        } else {
                                map = kmap(page);
                                read_extent_buffer(leaf, map + pg_offset, ptr,
                set_extent_uptodate(io_tree, em->start,
                                    extent_map_end(em) - 1, NULL, GFP_NOFS);
                goto insert;
 -      } else {
 -              WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
        }
  not_found:
        em->start = start;
        return ret;
  }
  
 +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
 +{
 +      struct radix_tree_root *root = &inode->i_mapping->page_tree;
 +      int found = false;
 +      void **pagep = NULL;
 +      struct page *page = NULL;
 +      int start_idx;
 +      int end_idx;
 +
 +      start_idx = start >> PAGE_CACHE_SHIFT;
 +
 +      /*
 +       * end is the last byte in the last page.  end == start is legal
 +       */
 +      end_idx = end >> PAGE_CACHE_SHIFT;
 +
 +      rcu_read_lock();
 +
 +      /*
 +       * Most of the code in this while loop is lifted from
 +       * find_get_page.  It's been modified to begin searching from a
 +       * page and return just the first page found in that range.  If the
 +       * found idx is less than or equal to the end idx then we know that
 +       * a page exists.  If no pages are found or if those pages are
 +       * outside of the range then we're fine (yay!)
 +       */
 +      while (page == NULL &&
 +             radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
 +              page = radix_tree_deref_slot(pagep);
 +              if (unlikely(!page))
 +                      break;
 +
 +              if (radix_tree_exception(page)) {
 +                      if (radix_tree_deref_retry(page)) {
 +                              page = NULL;
 +                              continue;
 +                      }
 +                      /*
 +                       * Otherwise, shmem/tmpfs must be storing a swap entry
 +                       * here as an exceptional entry: so return it without
 +                       * attempting to raise page count.
 +                       */
 +                      page = NULL;
 +                      break; /* TODO: Is this relevant for this use case? */
 +              }
 +
 +              if (!page_cache_get_speculative(page)) {
 +                      page = NULL;
 +                      continue;
 +              }
 +
 +              /*
 +               * Has the page moved?
 +               * This is part of the lockless pagecache protocol. See
 +               * include/linux/pagemap.h for details.
 +               */
 +              if (unlikely(page != *pagep)) {
 +                      page_cache_release(page);
 +                      page = NULL;
 +              }
 +      }
 +
 +      if (page) {
 +              if (page->index <= end_idx)
 +                      found = true;
 +              page_cache_release(page);
 +      }
 +
 +      rcu_read_unlock();
 +      return found;
 +}
 +
  static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                              struct extent_state **cached_state, int writing)
  {
                 * invalidate needs to happen so that reads after a write do not
                 * get stale data.
                 */
 -              if (!ordered && (!writing ||
 -                  !test_range_bit(&BTRFS_I(inode)->io_tree,
 -                                  lockstart, lockend, EXTENT_UPTODATE, 0,
 -                                  *cached_state)))
 +              if (!ordered &&
 +                  (!writing ||
 +                   !btrfs_page_exists_in_range(inode, lockstart, lockend)))
                        break;
  
                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@@ -7180,7 -7126,7 +7180,7 @@@ static void btrfs_end_dio_bio(struct bi
                 * before the atomic variable goes to zero, we must make
                 * sure dip->errors is perceived to be set.
                 */
 -              smp_mb__before_atomic_dec();
 +              smp_mb__before_atomic();
        }
  
        /* if there are more bios still pending for this dio, just exit */
@@@ -7360,7 -7306,7 +7360,7 @@@ out_err
          * before the atomic variable goes to zero, we must
          * make sure dip->errors is perceived to be set.
         */
 -      smp_mb__before_atomic_dec();
 +      smp_mb__before_atomic();
        if (atomic_dec_and_test(&dip->pending_bios))
                bio_io_error(dip->orig_bio);
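
  The two smp_mb__before_atomic() conversions in this file keep the same ordering guarantee under the renamed barrier API. A minimal sketch of the pattern, using the names from the surrounding code:

	/*
	 * Publish the error before the final decrement, so whichever
	 * context sees pending_bios hit zero also observes dip->errors.
	 * smp_mb__before_atomic() is the renamed equivalent of the
	 * smp_mb__before_atomic_dec() it replaces here.
	 */
	dip->errors = 1;
	smp_mb__before_atomic();
	if (atomic_dec_and_test(&dip->pending_bios))
		bio_io_error(dip->orig_bio);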
  
@@@ -7445,39 -7391,30 +7445,30 @@@ free_ordered
  }
  
  static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
-                       const struct iovec *iov, loff_t offset,
-                       unsigned long nr_segs)
+                       const struct iov_iter *iter, loff_t offset)
  {
        int seg;
        int i;
-       size_t size;
-       unsigned long addr;
        unsigned blocksize_mask = root->sectorsize - 1;
        ssize_t retval = -EINVAL;
-       loff_t end = offset;
  
        if (offset & blocksize_mask)
                goto out;
  
-       /* Check the memory alignment.  Blocks cannot straddle pages */
-       for (seg = 0; seg < nr_segs; seg++) {
-               addr = (unsigned long)iov[seg].iov_base;
-               size = iov[seg].iov_len;
-               end += size;
-               if ((addr & blocksize_mask) || (size & blocksize_mask))
-                       goto out;
-               /* If this is a write we don't need to check anymore */
-               if (rw & WRITE)
-                       continue;
+       if (iov_iter_alignment(iter) & blocksize_mask)
+               goto out;
  
-               /*
-                * Check to make sure we don't have duplicate iov_base's in this
-                * iovec, if so return EINVAL, otherwise we'll get csum errors
-                * when reading back.
-                */
-               for (i = seg + 1; i < nr_segs; i++) {
-                       if (iov[seg].iov_base == iov[i].iov_base)
+       /* If this is a write we don't need to check anymore */
+       if (rw & WRITE)
+               return 0;
+       /*
+        * Check that we don't have duplicate iov_base values in this
+        * iovec; if we do, return EINVAL, otherwise we'll get csum
+        * errors when reading back.
+        */
+       for (seg = 0; seg < iter->nr_segs; seg++) {
+               for (i = seg + 1; i < iter->nr_segs; i++) {
+                       if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
                                goto out;
                }
        }
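
  check_direct_IO() now folds the old per-segment address and length checks into a single iov_iter_alignment() test. A hedged sketch of what such a helper computes for a plain iovec-backed iter (not the kernel implementation itself): it ORs every segment's base and length together, so one mask test catches any misaligned segment.

	/* Sketch only; assumes an iovec-backed iter. */
	static unsigned long iovec_alignment(const struct iovec *iov,
					     unsigned long nr_segs)
	{
		unsigned long res = 0;
		unsigned long seg;

		for (seg = 0; seg < nr_segs; seg++)
			res |= (unsigned long)iov[seg].iov_base | iov[seg].iov_len;
		return res;	/* caller tests res & blocksize_mask */
	}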
@@@ -7487,8 -7424,7 +7478,7 @@@ out
  }
  
  static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
-                       const struct iovec *iov, loff_t offset,
-                       unsigned long nr_segs)
+                       struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        bool relock = false;
        ssize_t ret;
  
-       if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
-                           offset, nr_segs))
+       if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
                return 0;
  
        atomic_inc(&inode->i_dio_count);
 -      smp_mb__after_atomic_inc();
 +      smp_mb__after_atomic();
  
        /*
         * The generic stuff only does filemap_write_and_wait_range, which
         * we need to flush the dirty pages again to make absolutely sure
         * that any outstanding dirty pages are on disk.
         */
-       count = iov_length(iov, nr_segs);
+       count = iov_iter_count(iter);
        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                     &BTRFS_I(inode)->runtime_flags))
                filemap_fdatawrite_range(inode->i_mapping, offset, count);
  
        ret = __blockdev_direct_IO(rw, iocb, inode,
                        BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-                       iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+                       iter, offset, btrfs_get_blocks_direct, NULL,
                        btrfs_submit_direct, flags);
        if (rw & WRITE) {
                if (ret < 0 && ret != -EIOCBQUEUED)
@@@ -8046,7 -7981,7 +8035,7 @@@ int btrfs_create_subvol_root(struct btr
        err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
        if (err)
                btrfs_err(new_root->fs_info,
 -                        "error inheriting subvolume %llu properties: %d\n",
 +                        "error inheriting subvolume %llu properties: %d",
                          new_root->root_key.objectid, err);
  
        err = btrfs_update_inode(trans, new_root, inode);
@@@ -8365,7 -8300,7 +8354,7 @@@ static int btrfs_rename(struct inode *o
        BTRFS_I(old_inode)->dir_index = 0ULL;
        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
                /* force full log commit if subvolume involved. */
 -              root->fs_info->last_trans_log_full_commit = trans->transid;
 +              btrfs_set_log_full_commit(root->fs_info, trans);
        } else {
                ret = btrfs_insert_inode_ref(trans, dest,
                                             new_dentry->d_name.name,
@@@ -8943,66 -8878,6 +8932,66 @@@ static int btrfs_permission(struct inod
        return generic_permission(inode, mask);
  }
  
 +static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 +{
 +      struct btrfs_trans_handle *trans;
 +      struct btrfs_root *root = BTRFS_I(dir)->root;
 +      struct inode *inode = NULL;
 +      u64 objectid;
 +      u64 index;
 +      int ret = 0;
 +
 +      /*
 +       * 5 units required for adding orphan entry
 +       */
 +      trans = btrfs_start_transaction(root, 5);
 +      if (IS_ERR(trans))
 +              return PTR_ERR(trans);
 +
 +      ret = btrfs_find_free_ino(root, &objectid);
 +      if (ret)
 +              goto out;
 +
 +      inode = btrfs_new_inode(trans, root, dir, NULL, 0,
 +                              btrfs_ino(dir), objectid, mode, &index);
 +      if (IS_ERR(inode)) {
 +              ret = PTR_ERR(inode);
 +              inode = NULL;
 +              goto out;
 +      }
 +
 +      ret = btrfs_init_inode_security(trans, inode, dir, NULL);
 +      if (ret)
 +              goto out;
 +
 +      ret = btrfs_update_inode(trans, root, inode);
 +      if (ret)
 +              goto out;
 +
 +      inode->i_fop = &btrfs_file_operations;
 +      inode->i_op = &btrfs_file_inode_operations;
 +
 +      inode->i_mapping->a_ops = &btrfs_aops;
 +      inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 +      BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 +
 +      ret = btrfs_orphan_add(trans, inode);
 +      if (ret)
 +              goto out;
 +
 +      d_tmpfile(dentry, inode);
 +      mark_inode_dirty(inode);
 +
 +out:
 +      btrfs_end_transaction(trans, root);
 +      if (ret)
 +              iput(inode);
 +      btrfs_balance_delayed_items(root);
 +      btrfs_btree_balance_dirty(root);
 +
 +      return ret;
 +}
 +
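
  For context, a hedged userspace sketch of the flow btrfs_tmpfile() and the new btrfs_link() orphan handling serve: the inode starts out unlinked, kept alive only by its orphan item, and a later linkat() gives it a name, at which point the link path can drop the orphan entry. The mount point and file name are illustrative.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char proc_path[64];
		int fd = open("/mnt/btrfs", O_TMPFILE | O_RDWR, 0600);

		if (fd < 0) {
			perror("open(O_TMPFILE)");
			return 1;
		}
		if (write(fd, "scratch", 7) < 0)
			perror("write");

		/* Give the anonymous inode a name; i_nlink goes 0 -> 1. */
		snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d", fd);
		if (linkat(AT_FDCWD, proc_path, AT_FDCWD, "/mnt/btrfs/file",
			   AT_SYMLINK_FOLLOW) < 0)
			perror("linkat");

		close(fd);
		return 0;
	}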
  static const struct inode_operations btrfs_dir_inode_operations = {
        .getattr        = btrfs_getattr,
        .lookup         = btrfs_lookup,
        .get_acl        = btrfs_get_acl,
        .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
 +      .tmpfile        = btrfs_tmpfile,
  };
  static const struct inode_operations btrfs_dir_ro_inode_operations = {
        .lookup         = btrfs_lookup,
diff --combined fs/ceph/addr.c
index 65a30e817dd80ab9c7264ade89b9cae563998465,342ca5e423f9bb22ecbd1974778eb99c99d5ae6b..4f3f69079f362280379edf3b13c4766247c764fa
@@@ -694,7 -694,7 +694,7 @@@ static int ceph_writepages_start(struc
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
  
        if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
 -              pr_warning("writepage_start %p on forced umount\n", inode);
 +              pr_warn("writepage_start %p on forced umount\n", inode);
                return -EIO; /* we're in a forced umount, don't write! */
        }
        if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
@@@ -1187,8 -1187,8 +1187,8 @@@ static int ceph_write_end(struct file *
   * never get called.
   */
  static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
-                             const struct iovec *iov,
-                             loff_t pos, unsigned long nr_segs)
+                             struct iov_iter *iter,
+                             loff_t pos)
  {
        WARN_ON(1);
        return -EINVAL;
diff --combined fs/cifs/cifsfs.c
index 6aaa8112c538a73c82b15eaf8dd733abd21f39f0,496b520934e01adafd7a9a1d2ae90549ca008433..2c90d07c0b3aa3a6db836e0290fd0ecc2137b317
@@@ -87,6 -87,10 +87,6 @@@ extern mempool_t *cifs_mid_poolp
  
  struct workqueue_struct       *cifsiod_wq;
  
 -#ifdef CONFIG_CIFS_SMB2
 -__u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
 -#endif
 -
  /*
   * Bumps refcount for cifs super block.
   * Note that it should only be called if a reference to the VFS super block is
@@@ -247,7 -251,11 +247,7 @@@ cifs_alloc_inode(struct super_block *sb
         * server, cannot assume caching of file data or metadata.
         */
        cifs_set_oplock_level(cifs_inode, 0);
 -      cifs_inode->delete_pending = false;
 -      cifs_inode->invalid_mapping = false;
 -      clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags);
 -      clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags);
 -      clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags);
 +      cifs_inode->flags = 0;
        spin_lock_init(&cifs_inode->writers_lock);
        cifs_inode->writers = 0;
        cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
@@@ -294,7 -302,7 +294,7 @@@ cifs_show_address(struct seq_file *s, s
        struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
        struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
  
 -      seq_printf(s, ",addr=");
 +      seq_puts(s, ",addr=");
  
        switch (server->dstaddr.ss_family) {
        case AF_INET:
                        seq_printf(s, "%%%u", sa6->sin6_scope_id);
                break;
        default:
 -              seq_printf(s, "(unknown)");
 +              seq_puts(s, "(unknown)");
        }
  }
  
@@@ -316,45 -324,45 +316,45 @@@ cifs_show_security(struct seq_file *s, 
        if (ses->sectype == Unspecified)
                return;
  
 -      seq_printf(s, ",sec=");
 +      seq_puts(s, ",sec=");
  
        switch (ses->sectype) {
        case LANMAN:
 -              seq_printf(s, "lanman");
 +              seq_puts(s, "lanman");
                break;
        case NTLMv2:
 -              seq_printf(s, "ntlmv2");
 +              seq_puts(s, "ntlmv2");
                break;
        case NTLM:
 -              seq_printf(s, "ntlm");
 +              seq_puts(s, "ntlm");
                break;
        case Kerberos:
 -              seq_printf(s, "krb5");
 +              seq_puts(s, "krb5");
                break;
        case RawNTLMSSP:
 -              seq_printf(s, "ntlmssp");
 +              seq_puts(s, "ntlmssp");
                break;
        default:
                /* shouldn't ever happen */
 -              seq_printf(s, "unknown");
 +              seq_puts(s, "unknown");
                break;
        }
  
        if (ses->sign)
 -              seq_printf(s, "i");
 +              seq_puts(s, "i");
  }
  
  static void
  cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
  {
 -      seq_printf(s, ",cache=");
 +      seq_puts(s, ",cache=");
  
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
 -              seq_printf(s, "strict");
 +              seq_puts(s, "strict");
        else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
 -              seq_printf(s, "none");
 +              seq_puts(s, "none");
        else
 -              seq_printf(s, "loose");
 +              seq_puts(s, "loose");
  }
  
  static void
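
  The seq_printf() to seq_puts() conversions throughout this file follow one rule: seq_puts() for fixed strings with no format specifiers, seq_printf() only where a value is interpolated. Both cases appear in the options code below:

	seq_puts(s, ",seal");			/* constant string */
	seq_printf(s, ",uid=%u",		/* formatted value */
		   from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid));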
@@@ -387,7 -395,7 +387,7 @@@ cifs_show_options(struct seq_file *s, s
        cifs_show_cache_flavor(s, cifs_sb);
  
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
 -              seq_printf(s, ",multiuser");
 +              seq_puts(s, ",multiuser");
        else if (tcon->ses->user_name)
                seq_printf(s, ",username=%s", tcon->ses->user_name);
  
        seq_printf(s, ",uid=%u",
                   from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid));
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
 -              seq_printf(s, ",forceuid");
 +              seq_puts(s, ",forceuid");
        else
 -              seq_printf(s, ",noforceuid");
 +              seq_puts(s, ",noforceuid");
  
        seq_printf(s, ",gid=%u",
                   from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid));
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
 -              seq_printf(s, ",forcegid");
 +              seq_puts(s, ",forcegid");
        else
 -              seq_printf(s, ",noforcegid");
 +              seq_puts(s, ",noforcegid");
  
        cifs_show_address(s, tcon->ses->server);
  
        cifs_show_nls(s, cifs_sb->local_nls);
  
        if (tcon->seal)
 -              seq_printf(s, ",seal");
 +              seq_puts(s, ",seal");
        if (tcon->nocase)
 -              seq_printf(s, ",nocase");
 +              seq_puts(s, ",nocase");
        if (tcon->retry)
 -              seq_printf(s, ",hard");
 +              seq_puts(s, ",hard");
        if (tcon->unix_ext)
 -              seq_printf(s, ",unix");
 +              seq_puts(s, ",unix");
        else
 -              seq_printf(s, ",nounix");
 +              seq_puts(s, ",nounix");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
 -              seq_printf(s, ",posixpaths");
 +              seq_puts(s, ",posixpaths");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
 -              seq_printf(s, ",setuids");
 +              seq_puts(s, ",setuids");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
 -              seq_printf(s, ",serverino");
 +              seq_puts(s, ",serverino");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
 -              seq_printf(s, ",rwpidforward");
 +              seq_puts(s, ",rwpidforward");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
 -              seq_printf(s, ",forcemand");
 +              seq_puts(s, ",forcemand");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 -              seq_printf(s, ",nouser_xattr");
 +              seq_puts(s, ",nouser_xattr");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
 -              seq_printf(s, ",mapchars");
 +              seq_puts(s, ",mapchars");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
 -              seq_printf(s, ",sfu");
 +              seq_puts(s, ",sfu");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 -              seq_printf(s, ",nobrl");
 +              seq_puts(s, ",nobrl");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
 -              seq_printf(s, ",cifsacl");
 +              seq_puts(s, ",cifsacl");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
 -              seq_printf(s, ",dynperm");
 +              seq_puts(s, ",dynperm");
        if (root->d_sb->s_flags & MS_POSIXACL)
 -              seq_printf(s, ",acl");
 +              seq_puts(s, ",acl");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
 -              seq_printf(s, ",mfsymlinks");
 +              seq_puts(s, ",mfsymlinks");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
 -              seq_printf(s, ",fsc");
 +              seq_puts(s, ",fsc");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
 -              seq_printf(s, ",nostrictsync");
 +              seq_puts(s, ",nostrictsync");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
 -              seq_printf(s, ",noperm");
 +              seq_puts(s, ",noperm");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
                seq_printf(s, ",backupuid=%u",
                           from_kuid_munged(&init_user_ns,
@@@ -725,8 -733,7 +725,7 @@@ out_nls
        goto out;
  }
  
- static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                  unsigned long nr_segs, loff_t pos)
+ static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct inode *inode = file_inode(iocb->ki_filp);
        struct cifsInodeInfo *cinode = CIFS_I(inode);
        if (written)
                return written;
  
-       written = generic_file_aio_write(iocb, iov, nr_segs, pos);
+       written = generic_file_write_iter(iocb, from);
  
        if (CIFS_CACHE_WRITE(CIFS_I(inode)))
                goto out;
  
        rc = filemap_fdatawrite(inode->i_mapping);
        if (rc)
-               cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n",
+               cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
                         rc, inode);
  
  out:
@@@ -880,10 -887,10 +879,10 @@@ const struct inode_operations cifs_syml
  };
  
  const struct file_operations cifs_file_ops = {
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = generic_file_aio_read,
-       .aio_write = cifs_file_aio_write,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = generic_file_read_iter,
+       .write_iter = cifs_file_write_iter,
        .open = cifs_open,
        .release = cifs_close,
        .lock = cifs_lock,
  };
  
  const struct file_operations cifs_file_strict_ops = {
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = cifs_strict_readv,
-       .aio_write = cifs_strict_writev,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = cifs_strict_readv,
+       .write_iter = cifs_strict_writev,
        .open = cifs_open,
        .release = cifs_close,
        .lock = cifs_lock,
  
  const struct file_operations cifs_file_direct_ops = {
        /* BB reevaluate whether they can be done with directio, no cache */
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = cifs_user_readv,
-       .aio_write = cifs_user_writev,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = cifs_user_readv,
+       .write_iter = cifs_user_writev,
        .open = cifs_open,
        .release = cifs_close,
        .lock = cifs_lock,
  };
  
  const struct file_operations cifs_file_nobrl_ops = {
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = generic_file_aio_read,
-       .aio_write = cifs_file_aio_write,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = generic_file_read_iter,
+       .write_iter = cifs_file_write_iter,
        .open = cifs_open,
        .release = cifs_close,
        .fsync = cifs_fsync,
  };
  
  const struct file_operations cifs_file_strict_nobrl_ops = {
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = cifs_strict_readv,
-       .aio_write = cifs_strict_writev,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = cifs_strict_readv,
+       .write_iter = cifs_strict_writev,
        .open = cifs_open,
        .release = cifs_close,
        .fsync = cifs_strict_fsync,
  
  const struct file_operations cifs_file_direct_nobrl_ops = {
        /* BB reevaluate whether they can be done with directio, no cache */
-       .read = do_sync_read,
-       .write = do_sync_write,
-       .aio_read = cifs_user_readv,
-       .aio_write = cifs_user_writev,
+       .read = new_sync_read,
+       .write = new_sync_write,
+       .read_iter = cifs_user_readv,
+       .write_iter = cifs_user_writev,
        .open = cifs_open,
        .release = cifs_close,
        .fsync = cifs_fsync,
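
  All of these operation tables pair the .read/.write shims with ->read_iter/->write_iter implementations. A simplified sketch of how the new_sync_read() shim bridges a plain read(2) to ->read_iter(); this is close to, but not verbatim, the VFS helper:

	static ssize_t new_sync_read(struct file *filp, char __user *buf,
				     size_t len, loff_t *ppos)
	{
		struct iovec iov = { .iov_base = buf, .iov_len = len };
		struct kiocb kiocb;
		struct iov_iter iter;
		ssize_t ret;

		init_sync_kiocb(&kiocb, filp);
		kiocb.ki_pos = *ppos;
		iov_iter_init(&iter, READ, &iov, 1, len);

		ret = filp->f_op->read_iter(&kiocb, &iter);
		*ppos = kiocb.ki_pos;
		return ret;
	}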
@@@ -1184,6 -1191,10 +1183,6 @@@ init_cifs(void
        spin_lock_init(&cifs_file_list_lock);
        spin_lock_init(&GlobalMid_Lock);
  
 -#ifdef CONFIG_CIFS_SMB2
 -      get_random_bytes(cifs_client_guid, SMB2_CLIENT_GUID_SIZE);
 -#endif
 -
        if (cifs_max_pending < 2) {
                cifs_max_pending = 2;
                cifs_dbg(FYI, "cifs_max_pending set to min of 2\n");
diff --combined fs/cifs/cifsfs.h
index 8fe51166d6e3192bb8aadf2d86f5a0acf494622a,c9e91886f0cfd08d108662d60d3d38fdf26c11f9..70f178a7c759525a17fc758e0637bf1d0c941ead
  #ifndef _CIFSFS_H
  #define _CIFSFS_H
  
 +#include <linux/hash.h>
 +
  #define ROOT_I 2
  
  /*
   * ino_t is 32 bits on a 32-bit arch. We have to squash the 64-bit value down
 - * so that it will fit.
 + * so that it will fit. We use hash_64 to convert the value to 31 bits, and
 + * then add 1, to ensure that we don't end up with a 0 as the value.
   */
 +#if BITS_PER_LONG == 64
  static inline ino_t
  cifs_uniqueid_to_ino_t(u64 fileid)
  {
 -      ino_t ino = (ino_t) fileid;
 -      if (sizeof(ino_t) < sizeof(u64))
 -              ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8;
 -      return ino;
 +      return (ino_t)fileid;
  }
 +#else
 +static inline ino_t
 +cifs_uniqueid_to_ino_t(u64 fileid)
 +{
 +      return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
 +}
 +#endif
  
  extern struct file_system_type cifs_fs_type;
  extern const struct address_space_operations cifs_addr_ops;
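
  A hedged illustration of the 32-bit branch above, with an arbitrary example fileid: hash_64() folds the 64-bit server-assigned id into 31 bits, and the +1 keeps the result nonzero, since an ino_t of 0 would look like "no inode" to userspace.

	/* Illustrative value only. */
	u64 fileid = 0x123456789abcdef0ULL;
	ino_t ino = (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
	/* With a 32-bit ino_t: hash_64(fileid, 31) < 2^31, so ino lies in [1, 2^31]. */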
@@@ -75,8 -67,6 +75,8 @@@ extern int cifs_revalidate_dentry_attr(
  extern int cifs_revalidate_file(struct file *filp);
  extern int cifs_revalidate_dentry(struct dentry *);
  extern int cifs_invalidate_mapping(struct inode *inode);
 +extern int cifs_revalidate_mapping(struct inode *inode);
 +extern int cifs_zap_mapping(struct inode *inode);
  extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
  extern int cifs_setattr(struct dentry *, struct iattr *);
  
@@@ -95,14 -85,10 +95,10 @@@ extern const struct file_operations cif
  extern int cifs_open(struct inode *inode, struct file *file);
  extern int cifs_close(struct inode *inode, struct file *file);
  extern int cifs_closedir(struct inode *inode, struct file *file);
- extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
-                              unsigned long nr_segs, loff_t pos);
- extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
-                                unsigned long nr_segs, loff_t pos);
- extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos);
- extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
-                                 unsigned long nr_segs, loff_t pos);
+ extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
+ extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+ extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
  extern int cifs_lock(struct file *, int, struct file_lock *);
  extern int cifs_fsync(struct file *, loff_t, loff_t, int);
  extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
@@@ -140,5 -126,5 +136,5 @@@ extern long cifs_ioctl(struct file *fil
  extern const struct export_operations cifs_export_ops;
  #endif /* CONFIG_CIFS_NFSD_EXPORT */
  
 -#define CIFS_VERSION   "2.02"
 +#define CIFS_VERSION   "2.03"
  #endif                                /* _CIFSFS_H */
diff --combined fs/cifs/file.c
index 208f56eca4bf4de164d8af873b0050ac4884c5ea,60e9b5fa22128fc7a127a5859f0c6070aae09c36..e90a1e9aa627642c9ccefd428319f43b3d379c2f
@@@ -335,7 -335,7 +335,7 @@@ cifs_new_fileinfo(struct cifs_fid *fid
        spin_unlock(&cifs_file_list_lock);
  
        if (fid->purge_cache)
 -              cifs_invalidate_mapping(inode);
 +              cifs_zap_mapping(inode);
  
        file->private_data = cfile;
        return cfile;
@@@ -392,7 -392,7 +392,7 @@@ void cifsFileInfo_put(struct cifsFileIn
                 * again and get at least level II oplock.
                 */
                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
 -                      CIFS_I(inode)->invalid_mapping = true;
 +                      set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags);
                cifs_set_oplock_level(cifsi, 0);
        }
        spin_unlock(&cifs_file_list_lock);
@@@ -1529,7 -1529,7 +1529,7 @@@ cifs_setlk(struct file *file, struct fi
                 */
                if (!CIFS_CACHE_WRITE(CIFS_I(inode)) &&
                                        CIFS_CACHE_READ(CIFS_I(inode))) {
 -                      cifs_invalidate_mapping(inode);
 +                      cifs_zap_mapping(inode);
                        cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n",
                                 inode);
                        CIFS_I(inode)->oplock = 0;
@@@ -2218,7 -2218,7 +2218,7 @@@ int cifs_strict_fsync(struct file *file
                 file->f_path.dentry->d_name.name, datasync);
  
        if (!CIFS_CACHE_READ(CIFS_I(inode))) {
 -              rc = cifs_invalidate_mapping(inode);
 +              rc = cifs_zap_mapping(inode);
                if (rc) {
                        cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc);
                        rc = 0; /* don't care about it in fsync */
@@@ -2385,14 -2385,12 +2385,12 @@@ cifs_uncached_retry_writev(struct cifs_
  }
  
  static ssize_t
- cifs_iovec_write(struct file *file, const struct iovec *iov,
-                unsigned long nr_segs, loff_t *poffset)
+ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
  {
        unsigned long nr_pages, i;
        size_t bytes, copied, len, cur_len;
        ssize_t total_written = 0;
        loff_t offset;
-       struct iov_iter it;
        struct cifsFileInfo *open_file;
        struct cifs_tcon *tcon;
        struct cifs_sb_info *cifs_sb;
        int rc;
        pid_t pid;
  
-       len = iov_length(iov, nr_segs);
-       if (!len)
-               return 0;
+       len = iov_iter_count(from);
        rc = generic_write_checks(file, poffset, &len, 0);
        if (rc)
                return rc;
  
+       if (!len)
+               return 0;
+       iov_iter_truncate(from, len);
        INIT_LIST_HEAD(&wdata_list);
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
        open_file = file->private_data;
        else
                pid = current->tgid;
  
-       iov_iter_init(&it, iov, nr_segs, len, 0);
        do {
                size_t save_len;
  
  
                save_len = cur_len;
                for (i = 0; i < nr_pages; i++) {
-                       bytes = min_t(const size_t, cur_len, PAGE_SIZE);
-                       copied = iov_iter_copy_from_user(wdata->pages[i], &it,
-                                                        0, bytes);
+                       bytes = min_t(size_t, cur_len, PAGE_SIZE);
+                       copied = copy_page_from_iter(wdata->pages[i], 0, bytes,
+                                                    from);
                        cur_len -= copied;
-                       iov_iter_advance(&it, copied);
                        /*
                         * If we didn't copy as much as we expected, then that
                         * may mean we trod into an unmapped area. Stop copying
@@@ -2546,11 -2544,11 +2544,11 @@@ restart_loop
        return total_written ? total_written : (ssize_t)rc;
  }
  
- ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos)
+ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
  {
        ssize_t written;
        struct inode *inode;
+       loff_t pos = iocb->ki_pos;
  
        inode = file_inode(iocb->ki_filp);
  
         * write request.
         */
  
-       written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
+       written = cifs_iovec_write(iocb->ki_filp, from, &pos);
        if (written > 0) {
 -              CIFS_I(inode)->invalid_mapping = true;
 +              set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags);
                iocb->ki_pos = pos;
        }
  
  }
  
  static ssize_t
- cifs_writev(struct kiocb *iocb, const struct iovec *iov,
-           unsigned long nr_segs, loff_t pos)
+ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
        mutex_lock(&inode->i_mutex);
        if (file->f_flags & O_APPEND)
                lock_pos = i_size_read(inode);
-       if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs),
+       if (!cifs_find_lock_conflict(cfile, lock_pos, iov_iter_count(from),
                                     server->vals->exclusive_lock_type, NULL,
                                     CIFS_WRITE_OP)) {
-               rc = __generic_file_aio_write(iocb, iov, nr_segs);
+               rc = __generic_file_write_iter(iocb, from);
                mutex_unlock(&inode->i_mutex);
  
                if (rc > 0) {
  }
  
  ssize_t
- cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
-                  unsigned long nr_segs, loff_t pos)
+ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
  {
        struct inode *inode = file_inode(iocb->ki_filp);
        struct cifsInodeInfo *cinode = CIFS_I(inode);
                if (cap_unix(tcon->ses) &&
                (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
                  && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
-                       written = generic_file_aio_write(
-                                       iocb, iov, nr_segs, pos);
+                       written = generic_file_write_iter(iocb, from);
                        goto out;
                }
-               written = cifs_writev(iocb, iov, nr_segs, pos);
+               written = cifs_writev(iocb, from);
                goto out;
        }
        /*
         * affected pages because it may cause a error with mandatory locks on
         * these pages but not on the region from pos to ppos+len-1.
         */
-       written = cifs_user_writev(iocb, iov, nr_segs, pos);
+       written = cifs_user_writev(iocb, from);
        if (written > 0 && CIFS_CACHE_READ(cinode)) {
                /*
                 * Windows 7 server can delay breaking level2 oplock if a write
                 * request comes - break it on the client to prevent reading
                 * an old data.
                 */
 -              cifs_invalidate_mapping(inode);
 +              cifs_zap_mapping(inode);
                cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
                         inode);
                cinode->oplock = 0;
@@@ -2831,32 -2826,25 +2826,25 @@@ cifs_uncached_read_into_pages(struct TC
        return total_read > 0 ? total_read : result;
  }
  
- ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
-                              unsigned long nr_segs, loff_t pos)
+ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
  {
        struct file *file = iocb->ki_filp;
        ssize_t rc;
        size_t len, cur_len;
        ssize_t total_read = 0;
-       loff_t offset = pos;
+       loff_t offset = iocb->ki_pos;
        unsigned int npages;
        struct cifs_sb_info *cifs_sb;
        struct cifs_tcon *tcon;
        struct cifsFileInfo *open_file;
        struct cifs_readdata *rdata, *tmp;
        struct list_head rdata_list;
-       struct iov_iter to;
        pid_t pid;
  
-       if (!nr_segs)
-               return 0;
-       len = iov_length(iov, nr_segs);
+       len = iov_iter_count(to);
        if (!len)
                return 0;
  
-       iov_iter_init(&to, iov, nr_segs, len, 0);
        INIT_LIST_HEAD(&rdata_list);
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
        open_file = file->private_data;
@@@ -2914,7 -2902,7 +2902,7 @@@ error
        if (!list_empty(&rdata_list))
                rc = 0;
  
-       len = iov_iter_count(&to);
+       len = iov_iter_count(to);
        /* the loop below should proceed in the order of increasing offsets */
        list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
        again:
                                        goto again;
                                }
                        } else {
-                               rc = cifs_readdata_to_iov(rdata, &to);
+                               rc = cifs_readdata_to_iov(rdata, to);
                        }
  
                }
                kref_put(&rdata->refcount, cifs_uncached_readdata_release);
        }
  
-       total_read = len - iov_iter_count(&to);
+       total_read = len - iov_iter_count(to);
  
        cifs_stats_bytes_read(tcon, total_read);
  
                rc = 0;
  
        if (total_read) {
-               iocb->ki_pos = pos + total_read;
+               iocb->ki_pos += total_read;
                return total_read;
        }
        return rc;
  }
  
  ssize_t
- cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
-                 unsigned long nr_segs, loff_t pos)
+ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
  {
        struct inode *inode = file_inode(iocb->ki_filp);
        struct cifsInodeInfo *cinode = CIFS_I(inode);
         * pos+len-1.
         */
        if (!CIFS_CACHE_READ(cinode))
-               return cifs_user_readv(iocb, iov, nr_segs, pos);
+               return cifs_user_readv(iocb, to);
  
        if (cap_unix(tcon->ses) &&
            (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
            ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
-               return generic_file_aio_read(iocb, iov, nr_segs, pos);
+               return generic_file_read_iter(iocb, to);
  
        /*
         * We need to hold the sem to be sure nobody modifies lock list
         * with a brlock that prevents reading.
         */
        down_read(&cinode->lock_sem);
-       if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
+       if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
                                     tcon->ses->server->vals->shared_lock_type,
                                     NULL, CIFS_READ_OP))
-               rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
+               rc = generic_file_read_iter(iocb, to);
        up_read(&cinode->lock_sem);
        return rc;
  }
@@@ -3112,7 -3099,7 +3099,7 @@@ int cifs_file_strict_mmap(struct file *
        xid = get_xid();
  
        if (!CIFS_CACHE_READ(CIFS_I(inode))) {
 -              rc = cifs_invalidate_mapping(inode);
 +              rc = cifs_zap_mapping(inode);
                if (rc)
                        return rc;
        }
@@@ -3670,7 -3657,7 +3657,7 @@@ void cifs_oplock_break(struct work_stru
                if (!CIFS_CACHE_READ(cinode)) {
                        rc = filemap_fdatawait(inode->i_mapping);
                        mapping_set_error(inode->i_mapping, rc);
 -                      cifs_invalidate_mapping(inode);
 +                      cifs_zap_mapping(inode);
                }
                cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc);
        }
    * Direct IO is not yet supported in cached mode.
   */
  static ssize_t
- cifs_direct_io(int rw, struct kiocb *iocb, const struct iovec *iov,
-                loff_t pos, unsigned long nr_segs)
+ cifs_direct_io(int rw, struct kiocb *iocb, struct iov_iter *iter,
+                loff_t pos)
  {
          /*
           * FIXME
diff --combined fs/dcache.c
index 1792d6075b4f80ced75e04d17c02b8d362b51aac,e99c6f529ba8bbd307bd5024cb723056b74afe17..06f65857a855725247c1190d243c0e19cccd8570
@@@ -150,7 -150,7 +150,7 @@@ static long get_nr_dentry_unused(void
        return sum < 0 ? 0 : sum;
  }
  
 -int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 +int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
                   size_t *lenp, loff_t *ppos)
  {
        dentry_stat.nr_dentry = get_nr_dentry();
@@@ -532,10 -532,12 +532,12 @@@ static inline struct dentry *lock_paren
        struct dentry *parent = dentry->d_parent;
        if (IS_ROOT(dentry))
                return NULL;
+       if (unlikely((int)dentry->d_lockref.count < 0))
+               return NULL;
        if (likely(spin_trylock(&parent->d_lock)))
                return parent;
-       spin_unlock(&dentry->d_lock);
        rcu_read_lock();
+       spin_unlock(&dentry->d_lock);
  again:
        parent = ACCESS_ONCE(dentry->d_parent);
        spin_lock(&parent->d_lock);
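
  A hedged restatement of the two lock_parent() changes above, with the reasoning as comments:

	if (unlikely((int)dentry->d_lockref.count < 0))
		return NULL;	/* all-but-freed: ->d_parent may be stale */
	if (likely(spin_trylock(&parent->d_lock)))
		return parent;
	rcu_read_lock();		/* pin the parent's memory first... */
	spin_unlock(&dentry->d_lock);	/* ...then let go of the child */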
diff --combined fs/ext3/inode.c
index 695abe738a2409f4c32f4ef7d5749757d98b6f15,4d32133a76c4bbfd0393528f8ad463bf239bf354..2c6ccc49ba279cacf77fe6609fe44a50b970898c
@@@ -1716,17 -1716,17 +1716,17 @@@ static int ext3_journalled_writepage(st
        WARN_ON_ONCE(IS_RDONLY(inode) &&
                     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
  
 -      if (ext3_journal_current_handle())
 -              goto no_write;
 -
        trace_ext3_journalled_writepage(page);
 -      handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 -      if (IS_ERR(handle)) {
 -              ret = PTR_ERR(handle);
 -              goto no_write;
 -      }
 -
        if (!page_has_buffers(page) || PageChecked(page)) {
 +              if (ext3_journal_current_handle())
 +                      goto no_write;
 +
 +              handle = ext3_journal_start(inode,
 +                                          ext3_writepage_trans_blocks(inode));
 +              if (IS_ERR(handle)) {
 +                      ret = PTR_ERR(handle);
 +                      goto no_write;
 +              }
                /*
                 * It's mmapped pagecache.  Add buffers and journal it.  There
                 * doesn't seem much point in redirtying the page here.
                atomic_set(&EXT3_I(inode)->i_datasync_tid,
                           handle->h_transaction->t_tid);
                unlock_page(page);
 +              err = ext3_journal_stop(handle);
 +              if (!ret)
 +                      ret = err;
        } else {
                /*
 -               * It may be a page full of checkpoint-mode buffers.  We don't
 -               * really know unless we go poke around in the buffer_heads.
 -               * But block_write_full_page will do the right thing.
 +               * It is a page full of checkpoint-mode buffers. Go and write
 +               * them. They should have been already mapped when they went
 +               * to the journal so provide NULL get_block function to catch
 +               * errors.
                 */
 -              ret = block_write_full_page(page, ext3_get_block, wbc);
 +              ret = block_write_full_page(page, NULL, wbc);
        }
 -      err = ext3_journal_stop(handle);
 -      if (!ret)
 -              ret = err;
  out:
        return ret;
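
  A hedged note on the NULL get_block passed above: in data=journal mode those buffers were mapped when they first went to the journal, so writeback should never need to allocate; passing NULL makes any still-unmapped buffer an error rather than a silent allocation.

	/* Checkpoint-mode buffers are already mapped; NULL catches bugs. */
	ret = block_write_full_page(page, NULL, wbc);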
  
@@@ -1821,8 -1820,7 +1821,7 @@@ static int ext3_releasepage(struct pag
   * VFS code falls back into buffered path in that case so we are safe.
   */
  static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
-                       const struct iovec *iov, loff_t offset,
-                       unsigned long nr_segs)
+                       struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        handle_t *handle;
        ssize_t ret;
        int orphan = 0;
-       size_t count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        int retries = 0;
  
-       trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+       trace_ext3_direct_IO_enter(inode, offset, count, rw);
  
        if (rw == WRITE) {
                loff_t final_size = offset + count;
        }
  
  retry:
-       ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
-                                ext3_get_block);
+       ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block);
        /*
         * In case of error extending write may have instantiated a few
         * blocks outside i_size. Trim these off again.
         */
        if (unlikely((rw & WRITE) && ret < 0)) {
                loff_t isize = i_size_read(inode);
-               loff_t end = offset + iov_length(iov, nr_segs);
+               loff_t end = offset + count;
  
                if (end > isize)
                        ext3_truncate_failed_direct_write(inode);
                        ret = err;
        }
  out:
-       trace_ext3_direct_IO_exit(inode, offset,
-                               iov_length(iov, nr_segs), rw, ret);
+       trace_ext3_direct_IO_exit(inode, offset, count, rw, ret);
        return ret;
  }
  
diff --combined fs/ext4/ext4.h
index 1479e2ae00d28e83e8d1c175752a61b59828cd55,eb37d76bf9116f02dc005a837655062a9099aa26..7cc5a0e23688e1a2ce071dcb646b725a266ff890
@@@ -875,8 -875,6 +875,8 @@@ struct ext4_inode_info 
        struct inode vfs_inode;
        struct jbd2_inode *jinode;
  
 +      spinlock_t i_raw_lock;  /* protects updates to the raw inode */
 +
        /*
         * File creation time. Its function is same as that of
         * struct timespec i_{a,c,m}time in the generic inode.
@@@ -1160,8 -1158,7 +1160,8 @@@ struct ext4_super_block 
        __le32  s_usr_quota_inum;       /* inode for tracking user quota */
        __le32  s_grp_quota_inum;       /* inode for tracking group quota */
        __le32  s_overhead_clusters;    /* overhead blocks/clusters in fs */
 -      __le32  s_reserved[108];        /* Padding to the end of the block */
 +      __le32  s_backup_bgs[2];        /* groups with sparse_super2 SBs */
 +      __le32  s_reserved[106];        /* Padding to the end of the block */
        __le32  s_checksum;             /* crc32c(superblock) */
  };
  
@@@ -1507,7 -1504,6 +1507,7 @@@ static inline void ext4_clear_state_fla
  #define EXT4_FEATURE_COMPAT_EXT_ATTR          0x0008
  #define EXT4_FEATURE_COMPAT_RESIZE_INODE      0x0010
  #define EXT4_FEATURE_COMPAT_DIR_INDEX         0x0020
 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2     0x0200
  
  #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER   0x0001
  #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE     0x0002
@@@ -1956,6 -1952,10 +1956,6 @@@ extern void ext4_get_group_no_and_offse
  extern ext4_group_t ext4_get_group_number(struct super_block *sb,
                                          ext4_fsblk_t block);
  
 -extern void ext4_validate_block_bitmap(struct super_block *sb,
 -                                     struct ext4_group_desc *desc,
 -                                     ext4_group_t block_group,
 -                                     struct buffer_head *bh);
  extern unsigned int ext4_block_group(struct super_block *sb,
                        ext4_fsblk_t blocknr);
  extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
@@@ -1984,9 -1984,16 +1984,9 @@@ extern int ext4_wait_block_bitmap(struc
                                  struct buffer_head *bh);
  extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
                                                  ext4_group_t block_group);
 -extern void ext4_init_block_bitmap(struct super_block *sb,
 -                                 struct buffer_head *bh,
 -                                 ext4_group_t group,
 -                                 struct ext4_group_desc *desc);
  extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
                                              ext4_group_t block_group,
                                              struct ext4_group_desc *gdp);
 -extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
 -                                         ext4_group_t block_group,
 -                                         struct ext4_group_desc *gdp);
  ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
  
  /* dir.c */
@@@ -2129,6 -2136,8 +2129,6 @@@ extern int ext4_alloc_da_blocks(struct 
  extern void ext4_set_aops(struct inode *inode);
  extern int ext4_writepage_trans_blocks(struct inode *);
  extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 -extern int ext4_block_truncate_page(handle_t *handle,
 -              struct address_space *mapping, loff_t from);
  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t lend);
  extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
@@@ -2140,8 -2149,7 +2140,7 @@@ extern void ext4_da_update_reserve_spac
  extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_map_blocks *map, int flags);
  extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
-                               const struct iovec *iov, loff_t offset,
-                               unsigned long nr_segs);
+                               struct iov_iter *iter, loff_t offset);
  extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
  extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
  extern void ext4_ind_truncate(handle_t *, struct inode *inode);
@@@ -2188,6 -2196,8 +2187,6 @@@ extern int ext4_resize_fs(struct super_
  
  /* super.c */
  extern int ext4_calculate_overhead(struct super_block *sb);
 -extern int ext4_superblock_csum_verify(struct super_block *sb,
 -                                     struct ext4_super_block *es);
  extern void ext4_superblock_csum_set(struct super_block *sb);
  extern void *ext4_kvmalloc(size_t size, gfp_t flags);
  extern void *ext4_kvzalloc(size_t size, gfp_t flags);
@@@ -2559,11 -2569,19 +2558,11 @@@ extern const struct file_operations ext
  extern const struct inode_operations ext4_file_inode_operations;
  extern const struct file_operations ext4_file_operations;
  extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 -extern void ext4_unwritten_wait(struct inode *inode);
  
  /* inline.c */
  extern int ext4_has_inline_data(struct inode *inode);
 -extern int ext4_get_inline_size(struct inode *inode);
  extern int ext4_get_max_inline_size(struct inode *inode);
  extern int ext4_find_inline_data_nolock(struct inode *inode);
 -extern void ext4_write_inline_data(struct inode *inode,
 -                                 struct ext4_iloc *iloc,
 -                                 void *buffer, loff_t pos,
 -                                 unsigned int len);
 -extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
 -                                  unsigned int len);
  extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
                                 unsigned int len);
  extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
@@@ -2751,11 -2769,13 +2750,11 @@@ extern void ext4_io_submit(struct ext4_
  extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               struct page *page,
                               int len,
 -                             struct writeback_control *wbc);
 +                             struct writeback_control *wbc,
 +                             bool keep_towrite);
  
  /* mmp.c */
  extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
 -extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
 -extern int ext4_mmp_csum_verify(struct super_block *sb,
 -                              struct mmp_struct *mmp);
  
  /*
   * Note that these flags will never ever appear in a buffer_head's state flag.
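
[Editor's note] The s_backup_bgs[]/SPARSE_SUPER2 hunks above replace the classic backup-superblock placement (group 1 plus every group whose number is a power of 3, 5 or 7) with exactly two explicitly recorded backup groups. A minimal userspace sketch of the classic rule being superseded (illustrative only, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	/* Classic sparse_super rule: backups live in group 1 and in every
	 * group whose number is a power of 3, 5 or 7.  SPARSE_SUPER2 drops
	 * this in favour of the two groups listed in s_backup_bgs[]. */
	static bool is_power_of(unsigned int group, unsigned int base)
	{
		if (group == 0)
			return false;
		while (group % base == 0)
			group /= base;
		return group == 1;
	}

	static bool bg_has_backup_sb(unsigned int group)
	{
		return group == 1 || is_power_of(group, 3) ||
		       is_power_of(group, 5) || is_power_of(group, 7);
	}

	int main(void)
	{
		unsigned int g;

		for (g = 1; g < 130; g++)
			if (bg_has_backup_sb(g))
				printf("group %u carries a backup superblock\n", g);
		return 0;
	}
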
diff --combined fs/ext4/file.c
index 4e8bc284ec0e96296e8bbcf68423b9ea9ee8c921,708aad7681991368262332520f09f5490af848de..8695f70af1ef2046c2f68a24a5ed4e195cd6dc88
@@@ -57,7 -57,7 +57,7 @@@ static int ext4_release_file(struct ino
        return 0;
  }
  
 -void ext4_unwritten_wait(struct inode *inode)
 +static void ext4_unwritten_wait(struct inode *inode)
  {
        wait_queue_head_t *wq = ext4_ioend_wq(inode);
  
   * or one thread will zero the other's data, causing corruption.
   */
  static int
- ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
-                  unsigned long nr_segs, loff_t pos)
+ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
  {
        struct super_block *sb = inode->i_sb;
        int blockmask = sb->s_blocksize - 1;
-       size_t count = iov_length(iov, nr_segs);
-       loff_t final_size = pos + count;
  
        if (pos >= i_size_read(inode))
                return 0;
  
-       if ((pos & blockmask) || (final_size & blockmask))
+       if ((pos | iov_iter_alignment(from)) & blockmask)
                return 1;
  
        return 0;
  }
  
  static ssize_t
- ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long nr_segs, loff_t pos)
+ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(iocb->ki_filp);
        struct blk_plug plug;
        int o_direct = file->f_flags & O_DIRECT;
        int overwrite = 0;
-       size_t length = iov_length(iov, nr_segs);
+       size_t length = iov_iter_count(from);
        ssize_t ret;
-       BUG_ON(iocb->ki_pos != pos);
+       loff_t pos = iocb->ki_pos;
  
        /*
         * Unaligned direct AIO must be serialized; see comment above
            ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
            !is_sync_kiocb(iocb) &&
            (file->f_flags & O_APPEND ||
-            ext4_unaligned_aio(inode, iov, nr_segs, pos))) {
+            ext4_unaligned_aio(inode, from, pos))) {
                aio_mutex = ext4_aio_mutex(inode);
                mutex_lock(aio_mutex);
                ext4_unwritten_wait(inode);
                        goto errout;
                }
  
-               if (pos + length > sbi->s_bitmap_maxbytes) {
-                       nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
-                                             sbi->s_bitmap_maxbytes - pos);
-               }
+               if (pos + length > sbi->s_bitmap_maxbytes)
+                       iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
        }
  
        if (o_direct) {
                }
        }
  
-       ret = __generic_file_aio_write(iocb, iov, nr_segs);
+       ret = __generic_file_write_iter(iocb, from);
        mutex_unlock(&inode->i_mutex);
  
        if (ret > 0) {
@@@ -244,7 -237,6 +237,7 @@@ static int ext4_file_open(struct inode 
                        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
                        if (IS_ERR(handle))
                                return PTR_ERR(handle);
 +                      BUFFER_TRACE(sbi->s_sbh, "get_write_access");
                        err = ext4_journal_get_write_access(handle, sbi->s_sbh);
                        if (err) {
                                ext4_journal_stop(handle);
@@@ -594,10 -586,10 +587,10 @@@ loff_t ext4_llseek(struct file *file, l
  
  const struct file_operations ext4_file_operations = {
        .llseek         = ext4_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = generic_file_aio_read,
-       .aio_write      = ext4_file_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = generic_file_read_iter,
+       .write_iter     = ext4_file_write_iter,
        .unlocked_ioctl = ext4_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
        .release        = ext4_release_file,
        .fsync          = ext4_sync_file,
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .fallocate      = ext4_fallocate,
  };
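
[Editor's note] The rewritten ext4_unaligned_aio() above folds the old per-iovec length loop into a single test, (pos | iov_iter_alignment(from)) & blockmask: iov_iter_alignment() ORs together every segment's base address and length, so one mask test catches any unaligned component. A rough userspace model of the idea, with a plain iovec array standing in for the iov_iter (names illustrative):

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/uio.h>

	/* Rough model of iov_iter_alignment(): OR together every segment's
	 * base address and length so that one mask test later catches any
	 * unaligned component.  (The real helper also handles partially
	 * consumed segments.) */
	static unsigned long iovec_alignment(const struct iovec *iov, int nr)
	{
		unsigned long res = 0;
		int i;

		for (i = 0; i < nr; i++)
			res |= (uintptr_t)iov[i].iov_base | iov[i].iov_len;
		return res;
	}

	int main(void)
	{
		static char buf[8192] __attribute__((aligned(4096)));
		struct iovec iov[2] = {
			{ .iov_base = buf,        .iov_len = 4096 },
			{ .iov_base = buf + 4096, .iov_len = 512  }, /* short tail */
		};
		unsigned long blockmask = 4096 - 1;
		long long pos = 0;

		/* mirrors: (pos | iov_iter_alignment(from)) & blockmask */
		if ((pos | iovec_alignment(iov, 2)) & blockmask)
			printf("unaligned AIO: must be serialized\n");
		else
			printf("fully block-aligned\n");
		return 0;
	}
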
  
diff --combined fs/ext4/inode.c
index 7fcd68ee915500cd53ef79cbea1c187e71024004,b2cee73c14375fec3a01fe4f434886867d16a2e1..8a064734e6eb3ed06461e9954d036da6ff1e8147
@@@ -148,9 -148,6 +148,9 @@@ static int ext4_inode_is_fast_symlink(s
          int ea_blocks = EXT4_I(inode)->i_file_acl ?
                EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
  
 +      if (ext4_has_inline_data(inode))
 +              return 0;
 +
        return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
  }
  
@@@ -446,7 -443,7 +446,7 @@@ static void ext4_map_blocks_es_recheck(
         * could be converted.
         */
        if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
 -              down_read((&EXT4_I(inode)->i_data_sem));
 +              down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
@@@ -558,7 -555,7 +558,7 @@@ int ext4_map_blocks(handle_t *handle, s
         * file system block.
         */
        if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
 -              down_read((&EXT4_I(inode)->i_data_sem));
 +              down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
@@@ -630,7 -627,7 +630,7 @@@ found
         * the write lock of i_data_sem, and call get_blocks()
         * with create == 1 flag.
         */
 -      down_write((&EXT4_I(inode)->i_data_sem));
 +      down_write(&EXT4_I(inode)->i_data_sem);
  
        /*
         * if the caller is from delayed allocation writeout path
@@@ -925,7 -922,6 +925,7 @@@ int do_journal_get_write_access(handle_
         */
        if (dirty)
                clear_buffer_dirty(bh);
 +      BUFFER_TRACE(bh, "get write access");
        ret = ext4_journal_get_write_access(handle, bh);
        if (!ret && dirty)
                ret = ext4_handle_dirty_metadata(handle, NULL, bh);
@@@ -1544,7 -1540,7 +1544,7 @@@ static int ext4_da_map_blocks(struct in
                ext4_es_lru_add(inode);
                if (ext4_es_is_hole(&es)) {
                        retval = 0;
 -                      down_read((&EXT4_I(inode)->i_data_sem));
 +                      down_read(&EXT4_I(inode)->i_data_sem);
                        goto add_delayed;
                }
  
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
 -      down_read((&EXT4_I(inode)->i_data_sem));
 +      down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_has_inline_data(inode)) {
                /*
                 * We will soon create blocks for this page, and let
@@@ -1773,7 -1769,6 +1773,7 @@@ static int __ext4_journalled_writepage(
        BUG_ON(!ext4_handle_valid(handle));
  
        if (inline_data) {
 +              BUFFER_TRACE(inode_bh, "get write access");
                ret = ext4_journal_get_write_access(handle, inode_bh);
  
                err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
@@@ -1851,7 -1846,6 +1851,7 @@@ static int ext4_writepage(struct page *
        struct buffer_head *page_bufs = NULL;
        struct inode *inode = page->mapping->host;
        struct ext4_io_submit io_submit;
 +      bool keep_towrite = false;
  
        trace_ext4_writepage(page);
        size = i_size_read(inode);
                        unlock_page(page);
                        return 0;
                }
 +              keep_towrite = true;
        }
  
        if (PageChecked(page) && ext4_should_journal_data(inode))
                unlock_page(page);
                return -ENOMEM;
        }
 -      ret = ext4_bio_write_page(&io_submit, page, len, wbc);
 +      ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
        ext4_io_submit(&io_submit);
        /* Drop io_end reference we got from init */
        ext4_put_io_end_defer(io_submit.io_end);
@@@ -1918,7 -1911,7 +1918,7 @@@ static int mpage_submit_page(struct mpa
        else
                len = PAGE_CACHE_SIZE;
        clear_page_dirty_for_io(page);
 -      err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
 +      err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
        if (!err)
                mpd->wbc->nr_to_write--;
        mpd->first_page++;
@@@ -3093,13 -3086,12 +3093,12 @@@ static void ext4_end_io_dio(struct kioc
   *
   */
  static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
-                             const struct iovec *iov, loff_t offset,
-                             unsigned long nr_segs)
+                             struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;
-       size_t count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        int overwrite = 0;
        get_block_t *get_block_func = NULL;
        int dio_flags = 0;
  
        /* Use the old path for reads and writes beyond i_size. */
        if (rw != WRITE || final_size > inode->i_size)
-               return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+               return ext4_ind_direct_IO(rw, iocb, iter, offset);
  
        BUG_ON(iocb->private == NULL);
  
                dio_flags = DIO_LOCKING;
        }
        ret = __blockdev_direct_IO(rw, iocb, inode,
-                                  inode->i_sb->s_bdev, iov,
-                                  offset, nr_segs,
+                                  inode->i_sb->s_bdev, iter,
+                                  offset,
                                   get_block_func,
                                   ext4_end_io_dio,
                                   NULL,
@@@ -3230,11 -3222,11 +3229,11 @@@ retake_lock
  }
  
  static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
-                             const struct iovec *iov, loff_t offset,
-                             unsigned long nr_segs)
+                             struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
+       size_t count = iov_iter_count(iter);
        ssize_t ret;
  
        /*
        if (ext4_has_inline_data(inode))
                return 0;
  
-       trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+       trace_ext4_direct_IO_enter(inode, offset, count, rw);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-               ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+               ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
        else
-               ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
-       trace_ext4_direct_IO_exit(inode, offset,
-                               iov_length(iov, nr_segs), rw, ret);
+               ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
+       trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
        return ret;
  }
  
@@@ -3448,7 -3439,7 +3446,7 @@@ unlock
   * This is required during truncate. We need to physically zero the tail end
   * of that block so it doesn't yield old data if the file is later grown.
   */
 -int ext4_block_truncate_page(handle_t *handle,
 +static int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from)
  {
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
@@@ -4312,15 -4303,12 +4310,15 @@@ static int ext4_do_update_inode(handle_
        struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct buffer_head *bh = iloc->bh;
 +      struct super_block *sb = inode->i_sb;
        int err = 0, rc, block;
 -      int need_datasync = 0;
 +      int need_datasync = 0, set_large_file = 0;
        uid_t i_uid;
        gid_t i_gid;
  
 -      /* For fields not not tracking in the in-memory inode,
 +      spin_lock(&ei->i_raw_lock);
 +
 +      /* For fields not tracked in the in-memory inode,
         * initialise them to zero for new inodes. */
        if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
        EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
        EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
  
 -      if (ext4_inode_blocks_set(handle, raw_inode, ei))
 +      if (ext4_inode_blocks_set(handle, raw_inode, ei)) {
 +              spin_unlock(&ei->i_raw_lock);
                goto out_brelse;
 +      }
        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
                need_datasync = 1;
        }
        if (ei->i_disksize > 0x7fffffffULL) {
 -              struct super_block *sb = inode->i_sb;
                if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
                                EXT4_SB(sb)->s_es->s_rev_level ==
 -                              cpu_to_le32(EXT4_GOOD_OLD_REV)) {
 -                      /* If this is the first large file
 -                       * created, add a flag to the superblock.
 -                       */
 -                      err = ext4_journal_get_write_access(handle,
 -                                      EXT4_SB(sb)->s_sbh);
 -                      if (err)
 -                              goto out_brelse;
 -                      ext4_update_dynamic_rev(sb);
 -                      EXT4_SET_RO_COMPAT_FEATURE(sb,
 -                                      EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 -                      ext4_handle_sync(handle);
 -                      err = ext4_handle_dirty_super(handle, sb);
 -              }
 +                  cpu_to_le32(EXT4_GOOD_OLD_REV))
 +                      set_large_file = 1;
        }
        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
  
        ext4_inode_csum_set(inode, raw_inode, ei);
  
 +      spin_unlock(&ei->i_raw_lock);
 +
        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        rc = ext4_handle_dirty_metadata(handle, NULL, bh);
        if (!err)
                err = rc;
        ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 -
 +      if (set_large_file) {
 +              BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
 +              err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
 +              if (err)
 +                      goto out_brelse;
 +              ext4_update_dynamic_rev(sb);
 +              EXT4_SET_RO_COMPAT_FEATURE(sb,
 +                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 +              ext4_handle_sync(handle);
 +              err = ext4_handle_dirty_super(handle, sb);
 +      }
        ext4_update_inode_fsync_trans(handle, inode, need_datasync);
  out_brelse:
        brelse(bh);
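
[Editor's note] ext4_do_update_inode() above now fills the raw inode under the new i_raw_lock spinlock and only records set_large_file there, deferring the LARGE_FILE superblock update until after the unlock, because ext4_journal_get_write_access() can sleep. A userspace sketch of that record-under-lock, act-after-unlock pattern (all names hypothetical):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <unistd.h>

	static pthread_spinlock_t raw_lock;	/* stands in for ei->i_raw_lock */

	/* stand-in for ext4_journal_get_write_access(): may sleep, so it
	 * must never be called with raw_lock held */
	static int journal_get_write_access(void)
	{
		usleep(1000);
		return 0;
	}

	static void update_inode(long long disksize)
	{
		bool set_large_file = false;

		pthread_spin_lock(&raw_lock);
		/* ... copy in-memory fields into the raw on-disk inode ... */
		if (disksize > 0x7fffffffLL)
			set_large_file = true;	/* only record the fact here */
		pthread_spin_unlock(&raw_lock);

		/* the sleeping superblock update happens after the unlock */
		if (set_large_file && journal_get_write_access() == 0)
			printf("LARGE_FILE feature set in the superblock\n");
	}

	int main(void)
	{
		pthread_spin_init(&raw_lock, PTHREAD_PROCESS_PRIVATE);
		update_inode(1LL << 33);
		pthread_spin_destroy(&raw_lock);
		return 0;
	}
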
diff --combined fs/f2fs/data.c
index c1fb6dd10911c01e9b37d533a7588ee6bf934ecb,1d2e7e9624d2c4f5fc372a0f86be916169deebe4..0924521306b40c5087f2c2170c92fe7b03452862
@@@ -417,7 -417,7 +417,7 @@@ struct page *find_data_page(struct inod
        if (unlikely(dn.data_blkaddr == NEW_ADDR))
                return ERR_PTR(-EINVAL);
  
 -      page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
 +      page = grab_cache_page(mapping, index);
        if (!page)
                return ERR_PTR(-ENOMEM);
  
@@@ -455,7 -455,7 +455,7 @@@ struct page *get_lock_data_page(struct 
        int err;
  
  repeat:
 -      page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
 +      page = grab_cache_page(mapping, index);
        if (!page)
                return ERR_PTR(-ENOMEM);
  
@@@ -652,7 -652,8 +652,7 @@@ static int get_data_block(struct inode 
                goto put_out;
        }
  
 -      end_offset = IS_INODE(dn.node_page) ?
 -                      ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
 +      end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
        bh_result->b_size = (((size_t)1) << blkbits);
        dn.ofs_in_node++;
        pgofs++;
@@@ -674,7 -675,8 +674,7 @@@ get_next
                if (dn.data_blkaddr == NEW_ADDR)
                        goto put_out;
  
 -              end_offset = IS_INODE(dn.node_page) ?
 -                      ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
 +              end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
        }
  
        if (maxblocks > (bh_result->b_size >> blkbits)) {
@@@ -708,19 -710,11 +708,19 @@@ out
        return err;
  }
  
 +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 +              u64 start, u64 len)
 +{
 +      return generic_block_fiemap(inode, fieinfo, start, len, get_data_block);
 +}
 +
  static int f2fs_read_data_page(struct file *file, struct page *page)
  {
        struct inode *inode = page->mapping->host;
        int ret;
  
 +      trace_f2fs_readpage(page, DATA);
 +
        /* If the file has inline data, try to read it directlly */
        if (f2fs_has_inline_data(inode))
                ret = f2fs_read_inline_data(inode, page);
@@@ -796,8 -790,6 +796,8 @@@ static int f2fs_write_data_page(struct 
                .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
        };
  
 +      trace_f2fs_writepage(page, DATA);
 +
        if (page->index < end_index)
                goto write;
  
         * this page does not have to be written to disk.
         */
        offset = i_size & (PAGE_CACHE_SIZE - 1);
 -      if ((page->index >= end_index + 1) || !offset) {
 -              inode_dec_dirty_dents(inode);
 +      if ((page->index >= end_index + 1) || !offset)
                goto out;
 -      }
  
        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
  write:
  
        /* Dentry blocks are controlled by checkpoint */
        if (S_ISDIR(inode->i_mode)) {
 -              inode_dec_dirty_dents(inode);
                err = do_write_data_page(page, &fio);
                goto done;
        }
@@@ -837,16 -832,15 +837,16 @@@ done
  
        clear_cold_data(page);
  out:
 +      inode_dec_dirty_dents(inode);
        unlock_page(page);
        if (need_balance_fs)
                f2fs_balance_fs(sbi);
 +      if (wbc->for_reclaim)
 +              f2fs_submit_merged_bio(sbi, DATA, WRITE);
        return 0;
  
  redirty_out:
 -      wbc->pages_skipped++;
 -      account_page_redirty(page);
 -      set_page_dirty(page);
 +      redirty_page_for_writepage(wbc, page);
        return AOP_WRITEPAGE_ACTIVATE;
  }
  
@@@ -868,15 -862,12 +868,15 @@@ static int f2fs_write_data_pages(struc
        int ret;
        long diff;
  
 +      trace_f2fs_writepages(mapping->host, wbc, DATA);
 +
        /* deal with chardevs and other special file */
        if (!mapping->a_ops->writepage)
                return 0;
  
        if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
 -                      get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA))
 +                      get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) &&
 +                      available_free_memory(sbi, DIRTY_DENTS))
                goto skip_write;
  
        diff = nr_pages_to_write(sbi, DATA, wbc);
@@@ -912,8 -903,6 +912,8 @@@ static int f2fs_write_begin(struct fil
        struct dnode_of_data dn;
        int err = 0;
  
 +      trace_f2fs_write_begin(inode, pos, len, flags);
 +
        f2fs_balance_fs(sbi);
  repeat:
        err = f2fs_convert_inline_data(inode, pos + len);
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
 +
 +      /* to avoid latency during memory pressure */
 +      unlock_page(page);
 +
        *pagep = page;
  
        if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
        f2fs_unlock_op(sbi);
  
        if (err) {
 -              f2fs_put_page(page, 1);
 +              f2fs_put_page(page, 0);
                return err;
        }
  inline_data:
 +      lock_page(page);
 +      if (unlikely(page->mapping != mapping)) {
 +              f2fs_put_page(page, 1);
 +              goto repeat;
 +      }
 +
 +      f2fs_wait_on_page_writeback(page, DATA);
 +
        if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
                return 0;
  
@@@ -1001,8 -978,6 +1001,8 @@@ static int f2fs_write_end(struct file *
  {
        struct inode *inode = page->mapping->host;
  
 +      trace_f2fs_write_end(inode, pos, len, copied);
 +
        SetPageUptodate(page);
        set_page_dirty(page);
  
  }
  
  static int check_direct_IO(struct inode *inode, int rw,
-               const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+               struct iov_iter *iter, loff_t offset)
  {
        unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
-       int i;
  
        if (rw == READ)
                return 0;
        if (offset & blocksize_mask)
                return -EINVAL;
  
-       for (i = 0; i < nr_segs; i++)
-               if (iov[i].iov_len & blocksize_mask)
-                       return -EINVAL;
+       if (iov_iter_alignment(iter) & blocksize_mask)
+               return -EINVAL;
        return 0;
  }
  
  static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
-               const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+               struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        if (f2fs_has_inline_data(inode))
                return 0;
  
-       if (check_direct_IO(inode, rw, iov, offset, nr_segs))
+       if (check_direct_IO(inode, rw, iter, offset))
                return 0;
  
-       return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
-                                                       get_data_block);
 +      /* clear fsync mark to recover these blocks */
 +      fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
 +
+       return blockdev_direct_IO(rw, iocb, inode, iter, offset,
+                                 get_data_block);
  }
  
  static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
@@@ -1089,11 -1060,6 +1088,11 @@@ static int f2fs_set_data_page_dirty(str
  
  static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
  {
 +      struct inode *inode = mapping->host;
 +
 +      if (f2fs_has_inline_data(inode))
 +              return 0;
 +
        return generic_block_bmap(mapping, block, get_data_block);
  }
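
[Editor's note] f2fs_write_begin() above unlocks the freshly grabbed page across the block reservation and relocks it at inline_data:, so it must recheck page->mapping and retry the grab if truncation raced in between. A toy userspace model of that relock-and-recheck loop (struct page and the race are simulated):

	#include <stdio.h>
	#include <stdlib.h>

	struct page { void *mapping; };	/* toy stand-in */

	static struct page *grab_locked_page(void *mapping)
	{
		struct page *page = malloc(sizeof(*page));

		page->mapping = mapping;
		return page;
	}

	/* while the page was unlocked, truncation may detach it */
	static void maybe_truncate(struct page *page, int race)
	{
		if (race)
			page->mapping = NULL;
	}

	int main(void)
	{
		void *mapping = (void *)0x1000;	/* toy address-space cookie */
		struct page *page;
		int race = 1;

	repeat:
		page = grab_locked_page(mapping);
		/* unlock here, do the slow block reservation ... */
		maybe_truncate(page, race);
		race = 0;	/* race fires only once in this demo */
		/* relock, then recheck ownership as f2fs_write_begin does */
		if (page->mapping != mapping) {
			free(page);
			goto repeat;
		}
		printf("page still ours, proceed with the write\n");
		free(page);
		return 0;
	}
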
  
diff --combined fs/f2fs/file.c
index 9c49c593d8eb4ab39a1aa28c1b841f949d02c050,e4ba4b93f96a90e70c173fa309b242c0390ef8cf..c58e330757191392656d2819fd937a1cc564cb37
@@@ -19,7 -19,6 +19,7 @@@
  #include <linux/compat.h>
  #include <linux/uaccess.h>
  #include <linux/mount.h>
 +#include <linux/pagevec.h>
  
  #include "f2fs.h"
  #include "node.h"
@@@ -195,132 -194,6 +195,132 @@@ out
        return ret;
  }
  
 +static pgoff_t __get_first_dirty_index(struct address_space *mapping,
 +                                              pgoff_t pgofs, int whence)
 +{
 +      struct pagevec pvec;
 +      int nr_pages;
 +
 +      if (whence != SEEK_DATA)
 +              return 0;
 +
 +      /* find first dirty page index */
 +      pagevec_init(&pvec, 0);
 +      nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1);
 +      pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX;
 +      pagevec_release(&pvec);
 +      return pgofs;
 +}
 +
 +static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs,
 +                                                      int whence)
 +{
 +      switch (whence) {
 +      case SEEK_DATA:
 +              if ((blkaddr == NEW_ADDR && dirty == pgofs) ||
 +                      (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR))
 +                      return true;
 +              break;
 +      case SEEK_HOLE:
 +              if (blkaddr == NULL_ADDR)
 +                      return true;
 +              break;
 +      }
 +      return false;
 +}
 +
 +static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 +{
 +      struct inode *inode = file->f_mapping->host;
 +      loff_t maxbytes = inode->i_sb->s_maxbytes;
 +      struct dnode_of_data dn;
 +      pgoff_t pgofs, end_offset, dirty;
 +      loff_t data_ofs = offset;
 +      loff_t isize;
 +      int err = 0;
 +
 +      mutex_lock(&inode->i_mutex);
 +
 +      isize = i_size_read(inode);
 +      if (offset >= isize)
 +              goto fail;
 +
 +      /* handle inline data case */
 +      if (f2fs_has_inline_data(inode)) {
 +              if (whence == SEEK_HOLE)
 +                      data_ofs = isize;
 +              goto found;
 +      }
 +
 +      pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT);
 +
 +      dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
 +
 +      for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) {
 +              set_new_dnode(&dn, inode, NULL, NULL, 0);
 +              err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
 +              if (err && err != -ENOENT) {
 +                      goto fail;
 +              } else if (err == -ENOENT) {
 +                      /* direct node does not exist */
 +                      if (whence == SEEK_DATA) {
 +                              pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
 +                                                      F2FS_I(inode));
 +                              continue;
 +                      } else {
 +                              goto found;
 +                      }
 +              }
 +
 +              end_offset = IS_INODE(dn.node_page) ?
 +                      ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
 +
 +              /* find data/hole in dnode block */
 +              for (; dn.ofs_in_node < end_offset;
 +                              dn.ofs_in_node++, pgofs++,
 +                              data_ofs = pgofs << PAGE_CACHE_SHIFT) {
 +                      block_t blkaddr;
 +                      blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
 +
 +                      if (__found_offset(blkaddr, dirty, pgofs, whence)) {
 +                              f2fs_put_dnode(&dn);
 +                              goto found;
 +                      }
 +              }
 +              f2fs_put_dnode(&dn);
 +      }
 +
 +      if (whence == SEEK_DATA)
 +              goto fail;
 +found:
 +      if (whence == SEEK_HOLE && data_ofs > isize)
 +              data_ofs = isize;
 +      mutex_unlock(&inode->i_mutex);
 +      return vfs_setpos(file, data_ofs, maxbytes);
 +fail:
 +      mutex_unlock(&inode->i_mutex);
 +      return -ENXIO;
 +}
 +
 +static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
 +{
 +      struct inode *inode = file->f_mapping->host;
 +      loff_t maxbytes = inode->i_sb->s_maxbytes;
 +
 +      switch (whence) {
 +      case SEEK_SET:
 +      case SEEK_CUR:
 +      case SEEK_END:
 +              return generic_file_llseek_size(file, offset, whence,
 +                                              maxbytes, i_size_read(inode));
 +      case SEEK_DATA:
 +      case SEEK_HOLE:
 +              return f2fs_seek_block(file, offset, whence);
 +      }
 +
 +      return -EINVAL;
 +}
 +
  static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
  {
        file_accessed(file);
@@@ -369,9 -242,6 +369,9 @@@ static void truncate_partial_data_page(
        unsigned offset = from & (PAGE_CACHE_SIZE - 1);
        struct page *page;
  
 +      if (f2fs_has_inline_data(inode))
 +              return truncate_inline_data(inode, from);
 +
        if (!offset)
                return;
  
@@@ -418,7 -288,10 +418,7 @@@ int truncate_blocks(struct inode *inode
                return err;
        }
  
 -      if (IS_INODE(dn.node_page))
 -              count = ADDRS_PER_INODE(F2FS_I(inode));
 -      else
 -              count = ADDRS_PER_BLOCK;
 +      count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
  
        count -= dn.ofs_in_node;
        f2fs_bug_on(count < 0);
@@@ -540,7 -413,6 +540,7 @@@ const struct inode_operations f2fs_file
        .listxattr      = f2fs_listxattr,
        .removexattr    = generic_removexattr,
  #endif
 +      .fiemap         = f2fs_fiemap,
  };
  
  static void fill_zero(struct inode *inode, pgoff_t index,
@@@ -683,7 -555,6 +683,7 @@@ static int expand_inode_data(struct ino
                i_size_read(inode) < new_size) {
                i_size_write(inode, new_size);
                mark_inode_dirty(inode);
 +              f2fs_write_inode(inode, NULL);
        }
  
        return ret;
@@@ -807,11 -678,11 +807,11 @@@ long f2fs_compat_ioctl(struct file *fil
  #endif
  
  const struct file_operations f2fs_file_operations = {
 -      .llseek         = generic_file_llseek,
 +      .llseek         = f2fs_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = generic_file_aio_read,
-       .aio_write      = generic_file_aio_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = generic_file_read_iter,
+       .write_iter     = generic_file_write_iter,
        .open           = generic_file_open,
        .mmap           = f2fs_file_mmap,
        .fsync          = f2fs_sync_file,
        .compat_ioctl   = f2fs_compat_ioctl,
  #endif
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
  };
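
[Editor's note] With f2fs_llseek() wired up above, SEEK_DATA and SEEK_HOLE walk the dnode blocks (consulting the dirty-page index for data not yet on disk) instead of falling back to plain byte-offset seeks. The new behaviour is exercised from userspace through lseek(2); a minimal probe (file name illustrative):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	#ifndef SEEK_DATA
	#define SEEK_DATA 3
	#define SEEK_HOLE 4
	#endif

	int main(int argc, char **argv)
	{
		const char *path = argc > 1 ? argv[1] : "sparse.img";
		int fd = open(path, O_RDONLY);
		off_t data, hole;

		if (fd < 0) {
			perror(path);
			return 1;
		}
		data = lseek(fd, 0, SEEK_DATA);	/* first non-hole byte */
		hole = lseek(fd, data < 0 ? 0 : data, SEEK_HOLE);
		printf("first data at %lld, next hole at %lld\n",
		       (long long)data, (long long)hole);
		close(fd);
		return 0;
	}
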
diff --combined fs/fat/inode.c
index 9c83594d7fb5dbb03e5eb7ca378c4b35ffd3d85f,385cce464e822a3a9de91113d1b314482670965f..756aead10d9618593e3267e697df4915d528fbc9
  #define CONFIG_FAT_DEFAULT_IOCHARSET  ""
  #endif
  
 +#define KB_IN_SECTORS 2
 +
 +/*
 + * A deserialized copy of the on-disk structure laid out in struct
 + * fat_boot_sector.
 + */
 +struct fat_bios_param_block {
 +      u16     fat_sector_size;
 +      u8      fat_sec_per_clus;
 +      u16     fat_reserved;
 +      u8      fat_fats;
 +      u16     fat_dir_entries;
 +      u16     fat_sectors;
 +      u16     fat_fat_length;
 +      u32     fat_total_sect;
 +
 +      u8      fat16_state;
 +      u32     fat16_vol_id;
 +
 +      u32     fat32_length;
 +      u32     fat32_root_cluster;
 +      u16     fat32_info_sector;
 +      u8      fat32_state;
 +      u32     fat32_vol_id;
 +};
 +
  static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE;
  static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET;
  
 +static struct fat_floppy_defaults {
 +      unsigned nr_sectors;
 +      unsigned sec_per_clus;
 +      unsigned dir_entries;
 +      unsigned media;
 +      unsigned fat_length;
 +} floppy_defaults[] = {
 +{
 +      .nr_sectors = 160 * KB_IN_SECTORS,
 +      .sec_per_clus = 1,
 +      .dir_entries = 64,
 +      .media = 0xFE,
 +      .fat_length = 1,
 +},
 +{
 +      .nr_sectors = 180 * KB_IN_SECTORS,
 +      .sec_per_clus = 1,
 +      .dir_entries = 64,
 +      .media = 0xFC,
 +      .fat_length = 2,
 +},
 +{
 +      .nr_sectors = 320 * KB_IN_SECTORS,
 +      .sec_per_clus = 2,
 +      .dir_entries = 112,
 +      .media = 0xFF,
 +      .fat_length = 1,
 +},
 +{
 +      .nr_sectors = 360 * KB_IN_SECTORS,
 +      .sec_per_clus = 2,
 +      .dir_entries = 112,
 +      .media = 0xFD,
 +      .fat_length = 2,
 +},
 +};
  
  static int fat_add_cluster(struct inode *inode)
  {
@@@ -247,12 -185,13 +247,13 @@@ static int fat_write_end(struct file *f
  }
  
  static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
-                            const struct iovec *iov,
-                            loff_t offset, unsigned long nr_segs)
+                            struct iov_iter *iter,
+                            loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
+       size_t count = iov_iter_count(iter);
        ssize_t ret;
  
        if (rw == WRITE) {
                 *
                 * Return 0, and fall back to normal buffered write.
                 */
-               loff_t size = offset + iov_length(iov, nr_segs);
+               loff_t size = offset + count;
                if (MSDOS_I(inode)->mmu_private < size)
                        return 0;
        }
         * FAT needs to use DIO_LOCKING to avoid the race
         * condition between fat_get_block() and ->truncate().
         */
-       ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
-                                fat_get_block);
+       ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, fat_get_block);
        if (ret < 0 && (rw & WRITE))
-               fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
+               fat_write_failed(mapping, offset + count);
  
        return ret;
  }
@@@ -421,7 -359,7 +421,7 @@@ struct inode *fat_iget(struct super_blo
  
  static int is_exec(unsigned char *extension)
  {
 -      unsigned char *exe_extensions = "EXECOMBAT", *walk;
 +      unsigned char exe_extensions[] = "EXECOMBAT", *walk;
  
        for (walk = exe_extensions; *walk; walk += 3)
                if (!strncmp(extension, walk, 3))
@@@ -915,8 -853,6 +915,8 @@@ static int fat_show_options(struct seq_
                seq_puts(m, ",nfs=stale_rw");
        if (opts->discard)
                seq_puts(m, ",discard");
 +      if (opts->dos1xfloppy)
 +              seq_puts(m, ",dos1xfloppy");
  
        return 0;
  }
@@@ -931,7 -867,7 +931,7 @@@ enum 
        Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
        Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
        Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
 -      Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err,
 +      Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy,
  };
  
  static const match_table_t fat_tokens = {
        {Opt_nfs_stale_rw, "nfs"},
        {Opt_nfs_stale_rw, "nfs=stale_rw"},
        {Opt_nfs_nostale_ro, "nfs=nostale_ro"},
 +      {Opt_dos1xfloppy, "dos1xfloppy"},
        {Opt_obsolete, "conv=binary"},
        {Opt_obsolete, "conv=text"},
        {Opt_obsolete, "conv=auto"},
@@@ -1167,9 -1102,6 +1167,9 @@@ static int parse_options(struct super_b
                case Opt_nfs_nostale_ro:
                        opts->nfs = FAT_NFS_NOSTALE_RO;
                        break;
 +              case Opt_dos1xfloppy:
 +                      opts->dos1xfloppy = 1;
 +                      break;
  
                /* msdos specific */
                case Opt_dots:
@@@ -1315,169 -1247,6 +1315,169 @@@ static unsigned long calc_fat_clusters(
        return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
  }
  
 +static bool fat_bpb_is_zero(struct fat_boot_sector *b)
 +{
 +      if (get_unaligned_le16(&b->sector_size))
 +              return false;
 +      if (b->sec_per_clus)
 +              return false;
 +      if (b->reserved)
 +              return false;
 +      if (b->fats)
 +              return false;
 +      if (get_unaligned_le16(&b->dir_entries))
 +              return false;
 +      if (get_unaligned_le16(&b->sectors))
 +              return false;
 +      if (b->media)
 +              return false;
 +      if (b->fat_length)
 +              return false;
 +      if (b->secs_track)
 +              return false;
 +      if (b->heads)
 +              return false;
 +      return true;
 +}
 +
 +static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b,
 +      int silent, struct fat_bios_param_block *bpb)
 +{
 +      int error = -EINVAL;
 +
 +      /* Read in BPB ... */
 +      memset(bpb, 0, sizeof(*bpb));
 +      bpb->fat_sector_size = get_unaligned_le16(&b->sector_size);
 +      bpb->fat_sec_per_clus = b->sec_per_clus;
 +      bpb->fat_reserved = le16_to_cpu(b->reserved);
 +      bpb->fat_fats = b->fats;
 +      bpb->fat_dir_entries = get_unaligned_le16(&b->dir_entries);
 +      bpb->fat_sectors = get_unaligned_le16(&b->sectors);
 +      bpb->fat_fat_length = le16_to_cpu(b->fat_length);
 +      bpb->fat_total_sect = le32_to_cpu(b->total_sect);
 +
 +      bpb->fat16_state = b->fat16.state;
 +      bpb->fat16_vol_id = get_unaligned_le32(b->fat16.vol_id);
 +
 +      bpb->fat32_length = le32_to_cpu(b->fat32.length);
 +      bpb->fat32_root_cluster = le32_to_cpu(b->fat32.root_cluster);
 +      bpb->fat32_info_sector = le16_to_cpu(b->fat32.info_sector);
 +      bpb->fat32_state = b->fat32.state;
 +      bpb->fat32_vol_id = get_unaligned_le32(b->fat32.vol_id);
 +
 +      /* Validate this looks like a FAT filesystem BPB */
 +      if (!bpb->fat_reserved) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR,
 +                              "bogus number of reserved sectors");
 +              goto out;
 +      }
 +      if (!bpb->fat_fats) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
 +              goto out;
 +      }
 +
 +      /*
 +       * Earlier we checked here that b->secs_track and b->heads are nonzero,
 +       * but it turns out valid FAT filesystems can have zero there.
 +       */
 +
 +      if (!fat_valid_media(b->media)) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
 +                              (unsigned)b->media);
 +              goto out;
 +      }
 +
 +      if (!is_power_of_2(bpb->fat_sector_size)
 +          || (bpb->fat_sector_size < 512)
 +          || (bpb->fat_sector_size > 4096)) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
 +                             (unsigned)bpb->fat_sector_size);
 +              goto out;
 +      }
 +
 +      if (!is_power_of_2(bpb->fat_sec_per_clus)) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
 +                              (unsigned)bpb->fat_sec_per_clus);
 +              goto out;
 +      }
 +
 +      error = 0;
 +
 +out:
 +      return error;
 +}
 +
 +static int fat_read_static_bpb(struct super_block *sb,
 +      struct fat_boot_sector *b, int silent,
 +      struct fat_bios_param_block *bpb)
 +{
 +      static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
 +
 +      struct fat_floppy_defaults *fdefaults = NULL;
 +      int error = -EINVAL;
 +      sector_t bd_sects;
 +      unsigned i;
 +
 +      bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE;
 +
 +      /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
 +      if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR,
 +                              "%s; no bootstrapping code", notdos1x);
 +              goto out;
 +      }
 +
 +      /*
 +       * If any value in this region is non-zero, it isn't archaic
 +       * DOS.
 +       */
 +      if (!fat_bpb_is_zero(b)) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_ERR,
 +                              "%s; DOS 2.x BPB is non-zero", notdos1x);
 +              goto out;
 +      }
 +
 +      for (i = 0; i < ARRAY_SIZE(floppy_defaults); i++) {
 +              if (floppy_defaults[i].nr_sectors == bd_sects) {
 +                      fdefaults = &floppy_defaults[i];
 +                      break;
 +              }
 +      }
 +
 +      if (fdefaults == NULL) {
 +              if (!silent)
 +                      fat_msg(sb, KERN_WARNING,
 +                              "This looks like a DOS 1.x volume, but isn't a recognized floppy size (%llu sectors)",
 +                              (u64)bd_sects);
 +              goto out;
 +      }
 +
 +      if (!silent)
 +              fat_msg(sb, KERN_INFO,
 +                      "This looks like a DOS 1.x volume; assuming default BPB values");
 +
 +      memset(bpb, 0, sizeof(*bpb));
 +      bpb->fat_sector_size = SECTOR_SIZE;
 +      bpb->fat_sec_per_clus = fdefaults->sec_per_clus;
 +      bpb->fat_reserved = 1;
 +      bpb->fat_fats = 2;
 +      bpb->fat_dir_entries = fdefaults->dir_entries;
 +      bpb->fat_sectors = fdefaults->nr_sectors;
 +      bpb->fat_fat_length = fdefaults->fat_length;
 +
 +      error = 0;
 +
 +out:
 +      return error;
 +}
 +
  /*
   * Read the super block of an MS-DOS FS.
   */
@@@ -1487,11 -1256,12 +1487,11 @@@ int fat_fill_super(struct super_block *
        struct inode *root_inode = NULL, *fat_inode = NULL;
        struct inode *fsinfo_inode = NULL;
        struct buffer_head *bh;
 -      struct fat_boot_sector *b;
 +      struct fat_bios_param_block bpb;
        struct msdos_sb_info *sbi;
        u16 logical_sector_size;
        u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors;
        int debug;
 -      unsigned int media;
        long error;
        char buf[50];
  
                goto out_fail;
        }
  
 -      b = (struct fat_boot_sector *) bh->b_data;
 -      if (!b->reserved) {
 -              if (!silent)
 -                      fat_msg(sb, KERN_ERR, "bogus number of reserved sectors");
 -              brelse(bh);
 -              goto out_invalid;
 -      }
 -      if (!b->fats) {
 -              if (!silent)
 -                      fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
 -              brelse(bh);
 -              goto out_invalid;
 -      }
 -
 -      /*
 -       * Earlier we checked here that b->secs_track and b->head are nonzero,
 -       * but it turns out valid FAT filesystems can have zero there.
 -       */
 +      error = fat_read_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent,
 +              &bpb);
 +      if (error == -EINVAL && sbi->options.dos1xfloppy)
 +              error = fat_read_static_bpb(sb,
 +                      (struct fat_boot_sector *)bh->b_data, silent, &bpb);
 +      brelse(bh);
  
 -      media = b->media;
 -      if (!fat_valid_media(media)) {
 -              if (!silent)
 -                      fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
 -                             media);
 -              brelse(bh);
 +      if (error == -EINVAL)
                goto out_invalid;
 -      }
 -      logical_sector_size = get_unaligned_le16(&b->sector_size);
 -      if (!is_power_of_2(logical_sector_size)
 -          || (logical_sector_size < 512)
 -          || (logical_sector_size > 4096)) {
 -              if (!silent)
 -                      fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
 -                             logical_sector_size);
 -              brelse(bh);
 -              goto out_invalid;
 -      }
 -      sbi->sec_per_clus = b->sec_per_clus;
 -      if (!is_power_of_2(sbi->sec_per_clus)) {
 -              if (!silent)
 -                      fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
 -                             sbi->sec_per_clus);
 -              brelse(bh);
 -              goto out_invalid;
 -      }
 +      else if (error)
 +              goto out_fail;
  
 +      logical_sector_size = bpb.fat_sector_size;
 +      sbi->sec_per_clus = bpb.fat_sec_per_clus;
 +
 +      error = -EIO;
        if (logical_sector_size < sb->s_blocksize) {
                fat_msg(sb, KERN_ERR, "logical sector size too small for device"
                       " (logical sector size = %u)", logical_sector_size);
 -              brelse(bh);
                goto out_fail;
        }
 +
        if (logical_sector_size > sb->s_blocksize) {
 -              brelse(bh);
 +              struct buffer_head *bh_resize;
  
                if (!sb_set_blocksize(sb, logical_sector_size)) {
                        fat_msg(sb, KERN_ERR, "unable to set blocksize %u",
                               logical_sector_size);
                        goto out_fail;
                }
 -              bh = sb_bread(sb, 0);
 -              if (bh == NULL) {
 +
 +              /* Verify that the larger boot sector is fully readable */
 +              bh_resize = sb_bread(sb, 0);
 +              if (bh_resize == NULL) {
                        fat_msg(sb, KERN_ERR, "unable to read boot sector"
                               " (logical sector size = %lu)",
                               sb->s_blocksize);
                        goto out_fail;
                }
 -              b = (struct fat_boot_sector *) bh->b_data;
 +              brelse(bh_resize);
        }
  
        mutex_init(&sbi->s_lock);
        sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus;
        sbi->cluster_bits = ffs(sbi->cluster_size) - 1;
 -      sbi->fats = b->fats;
 +      sbi->fats = bpb.fat_fats;
        sbi->fat_bits = 0;              /* Don't know yet */
 -      sbi->fat_start = le16_to_cpu(b->reserved);
 -      sbi->fat_length = le16_to_cpu(b->fat_length);
 +      sbi->fat_start = bpb.fat_reserved;
 +      sbi->fat_length = bpb.fat_fat_length;
        sbi->root_cluster = 0;
        sbi->free_clusters = -1;        /* Don't know yet */
        sbi->free_clus_valid = 0;
        sbi->prev_free = FAT_START_ENT;
        sb->s_maxbytes = 0xffffffff;
  
 -      if (!sbi->fat_length && b->fat32.length) {
 +      if (!sbi->fat_length && bpb.fat32_length) {
                struct fat_boot_fsinfo *fsinfo;
                struct buffer_head *fsinfo_bh;
  
                /* Must be FAT32 */
                sbi->fat_bits = 32;
 -              sbi->fat_length = le32_to_cpu(b->fat32.length);
 -              sbi->root_cluster = le32_to_cpu(b->fat32.root_cluster);
 +              sbi->fat_length = bpb.fat32_length;
 +              sbi->root_cluster = bpb.fat32_root_cluster;
  
                /* MC - if info_sector is 0, don't multiply by 0 */
 -              sbi->fsinfo_sector = le16_to_cpu(b->fat32.info_sector);
 +              sbi->fsinfo_sector = bpb.fat32_info_sector;
                if (sbi->fsinfo_sector == 0)
                        sbi->fsinfo_sector = 1;
  
                if (fsinfo_bh == NULL) {
                        fat_msg(sb, KERN_ERR, "bread failed, FSINFO block"
                               " (sector = %lu)", sbi->fsinfo_sector);
 -                      brelse(bh);
                        goto out_fail;
                }
  
  
        /* interpret volume ID as a little endian 32 bit integer */
        if (sbi->fat_bits == 32)
 -              sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
 -                                      ((u32)b->fat32.vol_id[1] << 8) |
 -                                      ((u32)b->fat32.vol_id[2] << 16) |
 -                                      ((u32)b->fat32.vol_id[3] << 24));
 +              sbi->vol_id = bpb.fat32_vol_id;
        else /* fat 16 or 12 */
 -              sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
 -                                      ((u32)b->fat16.vol_id[1] << 8) |
 -                                      ((u32)b->fat16.vol_id[2] << 16) |
 -                                      ((u32)b->fat16.vol_id[3] << 24));
 +              sbi->vol_id = bpb.fat16_vol_id;
  
        sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
        sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
  
        sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length;
 -      sbi->dir_entries = get_unaligned_le16(&b->dir_entries);
 +      sbi->dir_entries = bpb.fat_dir_entries;
        if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
                if (!silent)
                        fat_msg(sb, KERN_ERR, "bogus directory-entries per block"
                               " (%u)", sbi->dir_entries);
 -              brelse(bh);
                goto out_invalid;
        }
  
        rootdir_sectors = sbi->dir_entries
                * sizeof(struct msdos_dir_entry) / sb->s_blocksize;
        sbi->data_start = sbi->dir_start + rootdir_sectors;
 -      total_sectors = get_unaligned_le16(&b->sectors);
 +      total_sectors = bpb.fat_sectors;
        if (total_sectors == 0)
 -              total_sectors = le32_to_cpu(b->total_sect);
 +              total_sectors = bpb.fat_total_sect;
  
        total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus;
  
  
        /* some OSes set FAT_STATE_DIRTY and clean it on unmount. */
        if (sbi->fat_bits == 32)
 -              sbi->dirty = b->fat32.state & FAT_STATE_DIRTY;
 +              sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY;
        else /* fat 16 or 12 */
 -              sbi->dirty = b->fat16.state & FAT_STATE_DIRTY;
 +              sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY;
  
        /* check that FAT table does not overflow */
        fat_clusters = calc_fat_clusters(sb);
                if (!silent)
                        fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
                               total_clusters);
 -              brelse(bh);
                goto out_invalid;
        }
  
        if (sbi->prev_free < FAT_START_ENT)
                sbi->prev_free = FAT_START_ENT;
  
 -      brelse(bh);
 -
        /* set up enough so that it can read an inode */
        fat_hash_init(sb);
        dir_hash_init(sb);
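
[Editor's note] fat_read_static_bpb() above synthesizes a BPB for DOS 1.x media, which carry none, by matching the raw device size against known floppy geometries; it runs only under the new dos1xfloppy mount option (e.g. mount -t msdos -o dos1xfloppy /dev/fd0 /mnt). A userspace sketch of the same table lookup, media byte omitted (sizes copied from floppy_defaults above):

	#include <stdio.h>

	#define KB_IN_SECTORS 2	/* 512-byte sectors per KiB */

	static const struct {
		unsigned int nr_sectors, sec_per_clus, dir_entries, fat_length;
	} floppy_defaults[] = {
		{ 160 * KB_IN_SECTORS, 1,  64, 1 },
		{ 180 * KB_IN_SECTORS, 1,  64, 2 },
		{ 320 * KB_IN_SECTORS, 2, 112, 1 },
		{ 360 * KB_IN_SECTORS, 2, 112, 2 },
	};

	int main(void)
	{
		unsigned long long bd_sects = 360 * KB_IN_SECTORS; /* 360K floppy */
		unsigned int i;

		for (i = 0; i < sizeof(floppy_defaults) / sizeof(floppy_defaults[0]); i++) {
			if (floppy_defaults[i].nr_sectors == bd_sects) {
				printf("DOS 1.x defaults: %u sec/clus, %u root entries, FAT %u sectors\n",
				       floppy_defaults[i].sec_per_clus,
				       floppy_defaults[i].dir_entries,
				       floppy_defaults[i].fat_length);
				return 0;
			}
		}
		printf("%llu sectors is not a recognized DOS 1.x floppy size\n",
		       bd_sects);
		return 1;
	}
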
diff --combined fs/file_table.c
index 40bf4660f0a3aa18bf881c2b3c09dea95e0808ec,f8cc881fbbfb3ff7fca0ea2bc589f0ab8a9f48db..385bfd31512a17f4e4c6869a3ee8f32c456cd327
@@@ -76,14 -76,14 +76,14 @@@ EXPORT_SYMBOL_GPL(get_max_files)
   * Handle nr_files sysctl
   */
  #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
 -int proc_nr_files(ctl_table *table, int write,
 +int proc_nr_files(struct ctl_table *table, int write,
                       void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        files_stat.nr_files = get_nr_files();
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
  }
  #else
 -int proc_nr_files(ctl_table *table, int write,
 +int proc_nr_files(struct ctl_table *table, int write,
                       void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
@@@ -175,6 -175,12 +175,12 @@@ struct file *alloc_file(struct path *pa
        file->f_path = *path;
        file->f_inode = path->dentry->d_inode;
        file->f_mapping = path->dentry->d_inode->i_mapping;
+       if ((mode & FMODE_READ) &&
+            likely(fop->read || fop->aio_read || fop->read_iter))
+               mode |= FMODE_CAN_READ;
+       if ((mode & FMODE_WRITE) &&
+            likely(fop->write || fop->aio_write || fop->write_iter))
+               mode |= FMODE_CAN_WRITE;
        file->f_mode = mode;
        file->f_op = fop;
        if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
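
[Editor's note] alloc_file() above caches whether any read/write method exists as FMODE_CAN_READ/FMODE_CAN_WRITE, so read/write paths can test one mode bit instead of three method pointers. A userspace sketch of the derivation (bit values illustrative, not the kernel's):

	#include <stdio.h>

	#define FMODE_READ	0x1
	#define FMODE_WRITE	0x2
	#define FMODE_CAN_READ	0x4	/* illustrative bit values only */
	#define FMODE_CAN_WRITE	0x8

	struct file_operations {	/* just the pointers the check looks at */
		void *read, *write, *aio_read, *aio_write, *read_iter, *write_iter;
	};

	static unsigned int derive_fmode(unsigned int mode,
					 const struct file_operations *fop)
	{
		if ((mode & FMODE_READ) &&
		    (fop->read || fop->aio_read || fop->read_iter))
			mode |= FMODE_CAN_READ;
		if ((mode & FMODE_WRITE) &&
		    (fop->write || fop->aio_write || fop->write_iter))
			mode |= FMODE_CAN_WRITE;
		return mode;
	}

	int main(void)
	{
		/* iter-only fops, like the filesystems converted in this merge */
		struct file_operations fop = {
			.read_iter = (void *)1, .write_iter = (void *)1,
		};
		unsigned int mode = derive_fmode(FMODE_READ | FMODE_WRITE, &fop);

		printf("can read: %d, can write: %d\n",
		       !!(mode & FMODE_CAN_READ), !!(mode & FMODE_CAN_WRITE));
		return 0;
	}
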
diff --combined fs/fuse/file.c
index 903cbc9cd6bd3a471f565e9fd3e2115539b58aca,b2dae9d1437cf36a9c61178166f9ff9f1de24ec1..6e16dad13e9b16de0358f8caaec9833d9f00a84b
@@@ -933,8 -933,7 +933,7 @@@ out
        return err;
  }
  
- static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-                                 unsigned long nr_segs, loff_t pos)
+ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        struct fuse_conn *fc = get_fuse_conn(inode);
         * i_size is up to date).
         */
        if (fc->auto_inval_data ||
-           (pos + iov_length(iov, nr_segs) > i_size_read(inode))) {
+           (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
                int err;
                err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
                if (err)
                        return err;
        }
  
-       return generic_file_aio_read(iocb, iov, nr_segs, pos);
+       return generic_file_read_iter(iocb, to);
  }
  
  static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
@@@ -1089,6 -1088,8 +1088,6 @@@ static ssize_t fuse_fill_write_pages(st
                tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
                flush_dcache_page(page);
  
 -              mark_page_accessed(page);
 -
                if (!tmp) {
                        unlock_page(page);
                        page_cache_release(page);
@@@ -1181,19 -1182,17 +1180,17 @@@ static ssize_t fuse_perform_write(struc
        return res > 0 ? res : err;
  }
  
- static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                  unsigned long nr_segs, loff_t pos)
+ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
-       size_t count = 0;
-       size_t ocount = 0;
+       size_t count = iov_iter_count(from);
        ssize_t written = 0;
        ssize_t written_buffered = 0;
        struct inode *inode = mapping->host;
        ssize_t err;
-       struct iov_iter i;
        loff_t endbyte = 0;
+       loff_t pos = iocb->ki_pos;
  
        if (get_fuse_conn(inode)->writeback_cache) {
                /* Update size (EOF optimization) and mode (SUID clearing) */
                if (err)
                        return err;
  
-               return generic_file_aio_write(iocb, iov, nr_segs, pos);
+               return generic_file_write_iter(iocb, from);
        }
  
-       WARN_ON(iocb->ki_pos != pos);
-       ocount = 0;
-       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-       if (err)
-               return err;
-       count = ocount;
        mutex_lock(&inode->i_mutex);
  
        /* We can write back this queue in page reclaim */
        if (count == 0)
                goto out;
  
+       iov_iter_truncate(from, count);
        err = file_remove_suid(file);
        if (err)
                goto out;
                goto out;
  
        if (file->f_flags & O_DIRECT) {
-               written = generic_file_direct_write(iocb, iov, &nr_segs, pos, 
-                                                   count, ocount);
-               if (written < 0 || written == count)
+               written = generic_file_direct_write(iocb, from, pos);
+               if (written < 0 || !iov_iter_count(from))
                        goto out;
  
                pos += written;
-               count -= written;
  
-               iov_iter_init(&i, iov, nr_segs, count, written);
-               written_buffered = fuse_perform_write(file, mapping, &i, pos);
+               written_buffered = fuse_perform_write(file, mapping, from, pos);
                if (written_buffered < 0) {
                        err = written_buffered;
                        goto out;
                written += written_buffered;
                iocb->ki_pos = pos + written_buffered;
        } else {
-               iov_iter_init(&i, iov, nr_segs, count, 0);
-               written = fuse_perform_write(file, mapping, &i, pos);
+               written = fuse_perform_write(file, mapping, from, pos);
                if (written >= 0)
                        iocb->ki_pos = pos + written;
        }
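
Note how the O_DIRECT fallback above needs no bookkeeping after the conversion: generic_file_direct_write() consumes from the iterator, so the untransferred tail is whatever iov_iter_count(from) still reports, and the old iov_iter_init() re-seeding before the buffered retry is gone. The fragment in isolation (error handling trimmed):

    written = generic_file_direct_write(iocb, from, pos);
    if (written < 0 || !iov_iter_count(from))
            goto out;       /* hard error, or everything went direct */

    /* short direct write: @from already points at the remainder */
    pos += written;
    written_buffered = fuse_perform_write(file, mapping, from, pos);
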
@@@ -1300,7 -1288,7 +1286,7 @@@ static int fuse_get_user_pages(struct f
        size_t nbytes = 0;  /* # bytes already packed in req */
  
        /* Special case for kernel I/O: can copy directly into the buffer */
-       if (segment_eq(get_fs(), KERNEL_DS)) {
+       if (ii->type & ITER_KVEC) {
                unsigned long user_addr = fuse_get_user_addr(ii);
                size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
  
  
        while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
                unsigned npages;
-               unsigned long user_addr = fuse_get_user_addr(ii);
-               unsigned offset = user_addr & ~PAGE_MASK;
-               size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes);
-               int ret;
+               size_t start;
                unsigned n = req->max_pages - req->num_pages;
-               frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT);
-               npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               npages = clamp(npages, 1U, n);
-               ret = get_user_pages_fast(user_addr, npages, !write,
-                                         &req->pages[req->num_pages]);
+               ssize_t ret = iov_iter_get_pages(ii,
+                                       &req->pages[req->num_pages],
+                                       n * PAGE_SIZE, &start);
                if (ret < 0)
                        return ret;
  
-               npages = ret;
-               frag_size = min_t(size_t, frag_size,
-                                 (npages << PAGE_SHIFT) - offset);
-               iov_iter_advance(ii, frag_size);
+               iov_iter_advance(ii, ret);
+               nbytes += ret;
+               ret += start;
+               npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
  
-               req->page_descs[req->num_pages].offset = offset;
+               req->page_descs[req->num_pages].offset = start;
                fuse_page_descs_length_init(req, req->num_pages, npages);
  
                req->num_pages += npages;
                req->page_descs[req->num_pages - 1].length -=
-                       (npages << PAGE_SHIFT) - offset - frag_size;
-               nbytes += frag_size;
+                       (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
        }
  
        if (write)
  
  static inline int fuse_iter_npages(const struct iov_iter *ii_p)
  {
-       struct iov_iter ii = *ii_p;
-       int npages = 0;
-       while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) {
-               unsigned long user_addr = fuse_get_user_addr(&ii);
-               unsigned offset = user_addr & ~PAGE_MASK;
-               size_t frag_size = iov_iter_single_seg_count(&ii);
-               npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               iov_iter_advance(&ii, frag_size);
-       }
-       return min(npages, FUSE_MAX_PAGES_PER_REQ);
+       return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ);
  }
  
- ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
-                      unsigned long nr_segs, size_t count, loff_t *ppos,
-                      int flags)
+ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+                      loff_t *ppos, int flags)
  {
        int write = flags & FUSE_DIO_WRITE;
        int cuse = flags & FUSE_DIO_CUSE;
        struct fuse_conn *fc = ff->fc;
        size_t nmax = write ? fc->max_write : fc->max_read;
        loff_t pos = *ppos;
+       size_t count = iov_iter_count(iter);
        pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
        pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
        ssize_t res = 0;
        struct fuse_req *req;
-       struct iov_iter ii;
-       iov_iter_init(&ii, iov, nr_segs, count, 0);
  
        if (io->async)
-               req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii));
+               req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
        else
-               req = fuse_get_req(fc, fuse_iter_npages(&ii));
+               req = fuse_get_req(fc, fuse_iter_npages(iter));
        if (IS_ERR(req))
                return PTR_ERR(req);
  
                size_t nres;
                fl_owner_t owner = current->files;
                size_t nbytes = min(count, nmax);
-               int err = fuse_get_user_pages(req, &ii, &nbytes, write);
+               int err = fuse_get_user_pages(req, iter, &nbytes, write);
                if (err) {
                        res = err;
                        break;
                        fuse_put_request(fc, req);
                        if (io->async)
                                req = fuse_get_req_for_background(fc,
-                                       fuse_iter_npages(&ii));
+                                       fuse_iter_npages(iter));
                        else
-                               req = fuse_get_req(fc, fuse_iter_npages(&ii));
+                               req = fuse_get_req(fc, fuse_iter_npages(iter));
                        if (IS_ERR(req))
                                break;
                }
  EXPORT_SYMBOL_GPL(fuse_direct_io);
  
  static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
-                                 const struct iovec *iov,
-                                 unsigned long nr_segs, loff_t *ppos,
-                                 size_t count)
+                                 struct iov_iter *iter,
+                                 loff_t *ppos)
  {
        ssize_t res;
        struct file *file = io->file;
        if (is_bad_inode(inode))
                return -EIO;
  
-       res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0);
+       res = fuse_direct_io(io, iter, ppos, 0);
  
        fuse_invalidate_attr(inode);
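
fuse_get_user_pages() above also shows the new pinning primitive. iov_iter_get_pages() returns the number of bytes it pinned and stores the offset of the first byte within the first page in *start, which is where the rounding arithmetic in the hunk comes from. The bookkeeping in isolation (a sketch):

    struct page *pages[16];
    size_t start;
    unsigned npages;
    ssize_t bytes;

    bytes = iov_iter_get_pages(iter, pages, 16 * PAGE_SIZE, &start);
    if (bytes < 0)
            return bytes;

    iov_iter_advance(iter, bytes);          /* consume what was pinned */
    /* the data begins at offset @start within pages[0] */
    npages = DIV_ROUND_UP(bytes + start, PAGE_SIZE);
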
  
@@@ -1483,22 -1446,26 +1444,26 @@@ static ssize_t fuse_direct_read(struct 
  {
        struct fuse_io_priv io = { .async = 0, .file = file };
        struct iovec iov = { .iov_base = buf, .iov_len = count };
-       return __fuse_direct_read(&io, &iov, 1, ppos, count);
+       struct iov_iter ii;
+       iov_iter_init(&ii, READ, &iov, 1, count);
+       return __fuse_direct_read(&io, &ii, ppos);
  }
  
  static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
-                                  const struct iovec *iov,
-                                  unsigned long nr_segs, loff_t *ppos)
+                                  struct iov_iter *iter,
+                                  loff_t *ppos)
  {
        struct file *file = io->file;
        struct inode *inode = file_inode(file);
-       size_t count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        ssize_t res;
  
        res = generic_write_checks(file, ppos, &count, 0);
-       if (!res)
-               res = fuse_direct_io(io, iov, nr_segs, count, ppos,
-                                    FUSE_DIO_WRITE);
+       if (!res) {
+               iov_iter_truncate(iter, count);
+               res = fuse_direct_io(io, iter, ppos, FUSE_DIO_WRITE);
+       }
  
        fuse_invalidate_attr(inode);
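
__fuse_direct_write() demonstrates the standard pairing after this series: generic_write_checks() may shrink the byte count (rlimits, s_maxbytes and friends), and the iterator must then be clipped to match before any I/O is issued. The idiom in isolation:

    size_t count = iov_iter_count(iter);
    ssize_t res;

    res = generic_write_checks(file, ppos, &count, 0);
    if (res)
            return res;
    /* keep the iterator in step with the possibly-reduced count */
    iov_iter_truncate(iter, count);
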
  
@@@ -1512,13 -1479,15 +1477,15 @@@ static ssize_t fuse_direct_write(struc
        struct inode *inode = file_inode(file);
        ssize_t res;
        struct fuse_io_priv io = { .async = 0, .file = file };
+       struct iov_iter ii;
+       iov_iter_init(&ii, WRITE, &iov, 1, count);
  
        if (is_bad_inode(inode))
                return -EIO;
  
        /* Don't allow parallel writes to the same file */
        mutex_lock(&inode->i_mutex);
-       res = __fuse_direct_write(&io, &iov, 1, ppos);
+       res = __fuse_direct_write(&io, &ii, ppos);
        if (res > 0)
                fuse_write_update_size(inode, *ppos);
        mutex_unlock(&inode->i_mutex);
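
The plain read()/write()-style entry points that remain (fuse_direct_read() and fuse_direct_write() here) now simply wrap the user buffer in a one-segment iov_iter on the stack; note that the data direction (READ or WRITE) became a property of the iterator itself in this series. Sketch:

    struct iovec iov = { .iov_base = buf, .iov_len = count };
    struct iov_iter ii;

    iov_iter_init(&ii, WRITE, &iov, 1, count);
    /* &ii can now be handed to any iter-based helper */
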
@@@ -2302,6 -2271,7 +2269,6 @@@ static int fuse_file_flock(struct file 
                struct fuse_file *ff = file->private_data;
  
                /* emulate flock with POSIX locks */
 -              fl->fl_owner = (fl_owner_t) file;
                ff->flock = true;
                err = fuse_setlk(file, fl, 1);
        }
@@@ -2372,7 -2342,7 +2339,7 @@@ static int fuse_ioctl_copy_user(struct 
        if (!bytes)
                return 0;
  
-       iov_iter_init(&ii, iov, nr_segs, bytes, 0);
+       iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes);
  
        while (iov_iter_count(&ii)) {
                struct page *page = pages[page_idx++];
@@@ -2894,8 -2864,8 +2861,8 @@@ static inline loff_t fuse_round_up(loff
  }
  
  static ssize_t
- fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-                       loff_t offset, unsigned long nr_segs)
+ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+                       loff_t offset)
  {
        ssize_t ret = 0;
        struct file *file = iocb->ki_filp;
        loff_t pos = 0;
        struct inode *inode;
        loff_t i_size;
-       size_t count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        struct fuse_io_priv *io;
  
        pos = offset;
                if (offset >= i_size)
                        return 0;
                count = min_t(loff_t, count, fuse_round_up(i_size - offset));
+               iov_iter_truncate(iter, count);
        }
  
        io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
                io->async = false;
  
        if (rw == WRITE)
-               ret = __fuse_direct_write(io, iov, nr_segs, &pos);
+               ret = __fuse_direct_write(io, iter, &pos);
        else
-               ret = __fuse_direct_read(io, iov, nr_segs, &pos, count);
+               ret = __fuse_direct_read(io, iter, &pos);
  
        if (io->async) {
                fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
  
  static const struct file_operations fuse_file_operations = {
        .llseek         = fuse_file_llseek,
-       .read           = do_sync_read,
-       .aio_read       = fuse_file_aio_read,
-       .write          = do_sync_write,
-       .aio_write      = fuse_file_aio_write,
+       .read           = new_sync_read,
+       .read_iter      = fuse_file_read_iter,
+       .write          = new_sync_write,
+       .write_iter     = fuse_file_write_iter,
        .mmap           = fuse_file_mmap,
        .open           = fuse_open,
        .flush          = fuse_flush,
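
The file_operations rewiring seen here repeats almost verbatim across the whole pile: do_sync_read()/do_sync_write(), which packaged a single iovec for ->aio_read/->aio_write, give way to new_sync_read()/new_sync_write(), which build an iov_iter and dispatch to ->read_iter/->write_iter. The resulting table shape, with hypothetical myfs_* methods:

    const struct file_operations myfs_file_operations = {
            .llseek         = generic_file_llseek,
            .read           = new_sync_read,        /* sync wrapper over ->read_iter */
            .read_iter      = generic_file_read_iter,
            .write          = new_sync_write,       /* sync wrapper over ->write_iter */
            .write_iter     = myfs_file_write_iter, /* hypothetical */
            .splice_read    = generic_file_splice_read,
            .splice_write   = iter_file_splice_write,
    };
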
diff --combined fs/gfs2/aops.c
index 492123cda64ab5d325db6a640d29d7640eeb6f10,910838951d66c375d9c7a2b38fd4a225c45245b0..805b37fed6383fc71abcb573de809ee8f3e41c53
@@@ -431,7 -431,7 +431,7 @@@ static int gfs2_jdata_writepages(struc
  
        ret = gfs2_write_cache_jdata(mapping, wbc);
        if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
 -              gfs2_log_flush(sdp, ip->i_gl);
 +              gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
                ret = gfs2_write_cache_jdata(mapping, wbc);
        }
        return ret;
@@@ -577,6 -577,7 +577,6 @@@ int gfs2_internal_read(struct gfs2_inod
                p = kmap_atomic(page);
                memcpy(buf + copied, p + offset, amt);
                kunmap_atomic(p);
 -              mark_page_accessed(page);
                page_cache_release(page);
                copied += amt;
                index++;
@@@ -1040,8 -1041,7 +1040,7 @@@ static int gfs2_ok_for_dio(struct gfs2_
  
  
  static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
-                             const struct iovec *iov, loff_t offset,
-                             unsigned long nr_segs)
+                             struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
         */
        if (mapping->nrpages) {
                loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
-               loff_t len = iov_length(iov, nr_segs);
+               loff_t len = iov_iter_count(iter);
                loff_t end = PAGE_ALIGN(offset + len) - 1;
  
                rv = 0;
                        truncate_inode_pages_range(mapping, lstart, end);
        }
  
-       rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-                                 offset, nr_segs, gfs2_get_block_direct,
-                                 NULL, NULL, 0);
+       rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+                                 iter, offset,
+                                 gfs2_get_block_direct, NULL, NULL, 0);
  out:
        gfs2_glock_dq(&gh);
        gfs2_holder_uninit(&gh);
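
gfs2's ->direct_IO() shows the address_space side of the same change: the iovec/nr_segs pair is gone, the total length is iov_iter_count(iter), and the iterator is passed straight through to __blockdev_direct_IO(). Reduced to a skeleton (myfs_get_block standing in for a real get_block_t):

    static ssize_t myfs_direct_IO(int rw, struct kiocb *iocb,
                                  struct iov_iter *iter, loff_t offset)
    {
            struct inode *inode = iocb->ki_filp->f_mapping->host;

            return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
                                        iter, offset, myfs_get_block,
                                        NULL, NULL, 0);
    }
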
diff --combined fs/gfs2/file.c
index 6ab0cfb2e891014436816e7ce2021a745291d2d6,01b4c5b1bff8ddd152a6b93ac3b5249d3d0a49a8..4fc3a3046174dc9a296c90a0d0ca6d53485e277b
@@@ -203,9 -203,9 +203,9 @@@ void gfs2_set_inode_flags(struct inode 
                             GFS2_DIF_INHERIT_JDATA)
  
  /**
 - * gfs2_set_flags - set flags on an inode
 - * @inode: The inode
 - * @flags: The flags to set
 + * do_gfs2_set_flags - set flags on an inode
 + * @filp: file pointer
 + * @reqflags: The flags to set
   * @mask: Indicates which flags are valid
   *
   */
@@@ -256,7 -256,7 +256,7 @@@ static int do_gfs2_set_flags(struct fil
        }
        if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
                if (flags & GFS2_DIF_JDATA)
 -                      gfs2_log_flush(sdp, ip->i_gl);
 +                      gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
                error = filemap_fdatawrite(inode->i_mapping);
                if (error)
                        goto out;
@@@ -318,7 -318,7 +318,7 @@@ static long gfs2_ioctl(struct file *fil
  
  /**
   * gfs2_size_hint - Give a hint to the size of a write request
 - * @file: The struct file
 + * @filep: The struct file
   * @offset: The file offset of the write
   * @size: The length of the write
   *
@@@ -371,7 -371,7 +371,7 @@@ static int gfs2_allocate_page_backing(s
  /**
   * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
   * @vma: The virtual memory area
 - * @page: The page which is about to become writable
 + * @vmf: The virtual memory fault containing the page to become writable
   *
   * When the page becomes writable, we need to ensure that we have
   * blocks allocated on disk to back that page.
@@@ -684,7 -684,7 +684,7 @@@ static int gfs2_fsync(struct file *file
  }
  
  /**
-  * gfs2_file_aio_write - Perform a write to a file
+  * gfs2_file_write_iter - Perform a write to a file
   * @iocb: The io context
   * @iov: The data to write
   * @nr_segs: Number of @iov segments
   *
   */
  
- static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                  unsigned long nr_segs, loff_t pos)
+ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
-       size_t writesize = iov_length(iov, nr_segs);
        struct gfs2_inode *ip = GFS2_I(file_inode(file));
        int ret;
  
        if (ret)
                return ret;
  
-       gfs2_size_hint(file, pos, writesize);
+       gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
  
        if (file->f_flags & O_APPEND) {
                struct gfs2_holder gh;
                gfs2_glock_dq_uninit(&gh);
        }
  
-       return generic_file_aio_write(iocb, iov, nr_segs, pos);
+       return generic_file_write_iter(iocb, from);
  }
  
  static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
@@@ -1058,10 -1056,10 +1056,10 @@@ static int gfs2_flock(struct file *file
  
  const struct file_operations gfs2_file_fops = {
        .llseek         = gfs2_llseek,
-       .read           = do_sync_read,
-       .aio_read       = generic_file_aio_read,
-       .write          = do_sync_write,
-       .aio_write      = gfs2_file_aio_write,
+       .read           = new_sync_read,
+       .read_iter      = generic_file_read_iter,
+       .write          = new_sync_write,
+       .write_iter     = gfs2_file_write_iter,
        .unlocked_ioctl = gfs2_ioctl,
        .mmap           = gfs2_mmap,
        .open           = gfs2_open,
        .lock           = gfs2_lock,
        .flock          = gfs2_flock,
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .setlease       = gfs2_setlease,
        .fallocate      = gfs2_fallocate,
  };
@@@ -1090,17 -1088,17 +1088,17 @@@ const struct file_operations gfs2_dir_f
  
  const struct file_operations gfs2_file_fops_nolock = {
        .llseek         = gfs2_llseek,
-       .read           = do_sync_read,
-       .aio_read       = generic_file_aio_read,
-       .write          = do_sync_write,
-       .aio_write      = gfs2_file_aio_write,
+       .read           = new_sync_read,
+       .read_iter      = generic_file_read_iter,
+       .write          = new_sync_write,
+       .write_iter     = gfs2_file_write_iter,
        .unlocked_ioctl = gfs2_ioctl,
        .mmap           = gfs2_mmap,
        .open           = gfs2_open,
        .release        = gfs2_release,
        .fsync          = gfs2_fsync,
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .setlease       = generic_setlease,
        .fallocate      = gfs2_fallocate,
  };
diff --combined fs/nfs/direct.c
index 4ad7bc3886791b0078ebc3ae4b326ed5e4c6566b,b122fe21fea0dce3ae5dbcbb362dcbb4820e73b8..8f98138cbc4385ba63b3af77ae907219d22e6991
@@@ -108,97 -108,6 +108,97 @@@ static inline int put_dreq(struct nfs_d
        return atomic_dec_and_test(&dreq->io_count);
  }
  
 +/*
 + * nfs_direct_select_verf - select the right verifier
 + * @dreq - direct request possibly spanning multiple servers
 + * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
 + * @ds_idx - index of data server in data server list, only valid if ds_clp set
 + *
 + * returns the correct verifier to use given the role of the server
 + */
 +static struct nfs_writeverf *
 +nfs_direct_select_verf(struct nfs_direct_req *dreq,
 +                     struct nfs_client *ds_clp,
 +                     int ds_idx)
 +{
 +      struct nfs_writeverf *verfp = &dreq->verf;
 +
 +#ifdef CONFIG_NFS_V4_1
 +      if (ds_clp) {
 +              /* pNFS is in use, use the DS verf */
 +              if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets)
 +                      verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf;
 +              else
 +                      WARN_ON_ONCE(1);
 +      }
 +#endif
 +      return verfp;
 +}
 +
 +
 +/*
 + * nfs_direct_set_hdr_verf - set the write/commit verifier
 + * @dreq - direct request possibly spanning multiple servers
 + * @hdr - pageio header to validate against previously seen verfs
 + *
 + * Set the server's (MDS or DS) "seen" verifier
 + */
 +static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
 +                                  struct nfs_pgio_header *hdr)
 +{
 +      struct nfs_writeverf *verfp;
 +
 +      verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
 +                                    hdr->data->ds_idx);
 +      WARN_ON_ONCE(verfp->committed >= 0);
 +      memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
 +      WARN_ON_ONCE(verfp->committed < 0);
 +}
 +
 +/*
 + * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
 + * @dreq - direct request possibly spanning multiple servers
 + * @hdr - pageio header to validate against previously seen verf
 + *
 + * set the server's "seen" verf if not initialized.
 + * returns result of comparison between @hdr->verf and the "seen"
 + * verf of the server used by @hdr (DS or MDS)
 + */
 +static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 +                                        struct nfs_pgio_header *hdr)
 +{
 +      struct nfs_writeverf *verfp;
 +
 +      verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
 +                                       hdr->data->ds_idx);
 +      if (verfp->committed < 0) {
 +              nfs_direct_set_hdr_verf(dreq, hdr);
 +              return 0;
 +      }
 +      return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
 +}
 +
 +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 +/*
 + * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
 + * @dreq - direct request possibly spanning multiple servers
 + * @data - commit data to validate against previously seen verf
 + *
 + * returns result of comparison between @data->verf and the verf of
 + * the server used by @data (DS or MDS)
 + */
 +static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
 +                                         struct nfs_commit_data *data)
 +{
 +      struct nfs_writeverf *verfp;
 +
 +      verfp = nfs_direct_select_verf(dreq, data->ds_clp,
 +                                       data->ds_commit_index);
 +      WARN_ON_ONCE(verfp->committed < 0);
 +      return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
 +}
 +#endif
 +
  /**
   * nfs_direct_IO - NFS address space operation for direct I/O
   * @rw: direction (read or write)
   * shunt off direct read and write requests before the VFS gets them,
   * so this method is only ever called for swap.
   */
- ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
+ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
  {
  #ifndef CONFIG_NFS_SWAP
        dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
-                       iocb->ki_filp, (long long) pos, nr_segs);
+                       iocb->ki_filp, (long long) pos, iter->nr_segs);
  
        return -EINVAL;
  #else
        VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
  
        if (rw == READ || rw == KERNEL_READ)
-               return nfs_file_direct_read(iocb, iov, nr_segs, pos,
+               return nfs_file_direct_read(iocb, iter, pos,
                                rw == READ ? true : false);
-       return nfs_file_direct_write(iocb, iov, nr_segs, pos,
+       return nfs_file_direct_write(iocb, iter, pos,
                                rw == WRITE ? true : false);
  #endif /* CONFIG_NFS_SWAP */
  }
@@@ -259,7 -168,6 +259,7 @@@ static inline struct nfs_direct_req *nf
        kref_get(&dreq->kref);
        init_completion(&dreq->completion);
        INIT_LIST_HEAD(&dreq->mds_cinfo.list);
 +      dreq->verf.committed = NFS_INVALID_STABLE_HOW;  /* not set yet */
        INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
        spin_lock_init(&dreq->lock);
  
@@@ -414,65 -322,43 +414,42 @@@ static const struct nfs_pgio_completion
   * handled automatically by nfs_direct_read_result().  Otherwise, if
   * no requests have been sent, just return an error.
   */
- static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
-                                               const struct iovec *iov,
-                                               loff_t pos, bool uio)
- {
-       struct nfs_direct_req *dreq = desc->pg_dreq;
-       struct nfs_open_context *ctx = dreq->ctx;
-       struct inode *inode = ctx->dentry->d_inode;
-       unsigned long user_addr = (unsigned long)iov->iov_base;
-       size_t count = iov->iov_len;
-       size_t rsize = NFS_SERVER(inode)->rsize;
-       unsigned int pgbase;
-       int result;
-       ssize_t started = 0;
-       struct page **pagevec = NULL;
-       unsigned int npages;
-
-       do {
-               size_t bytes;
-               int i;
  
-               pgbase = user_addr & ~PAGE_MASK;
-               bytes = min(max_t(size_t, rsize, PAGE_SIZE), count);
+ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
+                                             struct iov_iter *iter,
+                                             loff_t pos)
+ {
+       struct nfs_pageio_descriptor desc;
+       struct inode *inode = dreq->inode;
+       ssize_t result = -EINVAL;
+       size_t requested_bytes = 0;
+       size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);
  
-               result = -ENOMEM;
-               npages = nfs_page_array_len(pgbase, bytes);
-               if (!pagevec)
-                       pagevec = kmalloc(npages * sizeof(struct page *),
-                                         GFP_KERNEL);
-               if (!pagevec)
-                       break;
-               if (uio) {
-                       down_read(&current->mm->mmap_sem);
-                       result = get_user_pages(current, current->mm, user_addr,
-                                       npages, 1, 0, pagevec, NULL);
-                       up_read(&current->mm->mmap_sem);
-                       if (result < 0)
-                               break;
-               } else {
-                       WARN_ON(npages != 1);
-                       result = get_kernel_page(user_addr, 1, pagevec);
-                       if (WARN_ON(result != 1))
-                               break;
-               }
 -      NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode,
++      nfs_pageio_init_read(&desc, dreq->inode, false,
+                            &nfs_direct_read_completion_ops);
+       get_dreq(dreq);
+       desc.pg_dreq = dreq;
+       atomic_inc(&inode->i_dio_count);
  
-               if ((unsigned)result < npages) {
-                       bytes = result * PAGE_SIZE;
-                       if (bytes <= pgbase) {
-                               nfs_direct_release_pages(pagevec, result);
-                               break;
-                       }
-                       bytes -= pgbase;
-                       npages = result;
-               }
+       while (iov_iter_count(iter)) {
+               struct page **pagevec;
+               size_t bytes;
+               size_t pgbase;
+               unsigned npages, i;
  
+               result = iov_iter_get_pages_alloc(iter, &pagevec, 
+                                                 rsize, &pgbase);
+               if (result < 0)
+                       break;
+       
+               bytes = result;
+               iov_iter_advance(iter, bytes);
+               npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
                for (i = 0; i < npages; i++) {
                        struct nfs_page *req;
                        unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
                        /* XXX do we need to do the eof zeroing found in async_filler? */
 -                      req = nfs_create_request(dreq->ctx, dreq->inode,
 -                                               pagevec[i],
 +                      req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
                                                 pgbase, req_len);
                        if (IS_ERR(req)) {
                                result = PTR_ERR(req);
                        }
                        req->wb_index = pos >> PAGE_SHIFT;
                        req->wb_offset = pos & ~PAGE_MASK;
-                       if (!nfs_pageio_add_request(desc, req)) {
-                               result = desc->pg_error;
+                       if (!nfs_pageio_add_request(&desc, req)) {
+                               result = desc.pg_error;
                                nfs_release_request(req);
                                break;
                        }
                        pgbase = 0;
                        bytes -= req_len;
-                       started += req_len;
-                       user_addr += req_len;
+                       requested_bytes += req_len;
                        pos += req_len;
-                       count -= req_len;
                        dreq->bytes_left -= req_len;
                }
-               /* The nfs_page now hold references to these pages */
                nfs_direct_release_pages(pagevec, npages);
-       } while (count != 0 && result >= 0);
-
-       kfree(pagevec);
-
-       if (started)
-               return started;
-       return result < 0 ? (ssize_t) result : -EFAULT;
- }
-
- static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
-                                             const struct iovec *iov,
-                                             unsigned long nr_segs,
-                                             loff_t pos, bool uio)
- {
-       struct nfs_pageio_descriptor desc;
-       struct inode *inode = dreq->inode;
-       ssize_t result = -EINVAL;
-       size_t requested_bytes = 0;
-       unsigned long seg;
-
-       nfs_pageio_init_read(&desc, dreq->inode, false,
-                            &nfs_direct_read_completion_ops);
-       get_dreq(dreq);
-       desc.pg_dreq = dreq;
-       atomic_inc(&inode->i_dio_count);
-
-       for (seg = 0; seg < nr_segs; seg++) {
-               const struct iovec *vec = &iov[seg];
-               result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
+               kvfree(pagevec);
                if (result < 0)
                        break;
-               requested_bytes += result;
-               if ((size_t)result < vec->iov_len)
-                       break;
-               pos += vec->iov_len;
        }
  
        nfs_pageio_complete(&desc);
  /**
   * nfs_file_direct_read - file direct read operation for NFS files
   * @iocb: target I/O control block
-  * @iov: vector of user buffers into which to read data
-  * @nr_segs: size of iov vector
+  * @iter: vector of user buffers into which to read data
   * @pos: byte offset in file where reading starts
   *
   * We use this function for direct reads instead of calling
   * client must read the updated atime from the server back into its
   * cache.
   */
- ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos, bool uio)
+ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+                               loff_t pos, bool uio)
  {
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct nfs_direct_req *dreq;
        struct nfs_lock_context *l_ctx;
        ssize_t result = -EINVAL;
-       size_t count;
-       count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
  
        dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
                goto out_unlock;
  
        dreq->inode = inode;
-       dreq->bytes_left = iov_length(iov, nr_segs);
+       dreq->bytes_left = count;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        l_ctx = nfs_get_lock_context(dreq->ctx);
        if (IS_ERR(l_ctx)) {
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
  
-       NFS_I(inode)->read_io += iov_length(iov, nr_segs);
-       result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+       NFS_I(inode)->read_io += count;
+       result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
  
        mutex_unlock(&inode->i_mutex);
  
@@@ -655,7 -503,7 +594,7 @@@ static void nfs_direct_write_reschedule
        dreq->count = 0;
        get_dreq(dreq);
  
 -      NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE,
 +      nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
                              &nfs_direct_write_completion_ops);
        desc.pg_dreq = dreq;
  
@@@ -694,7 -542,7 +633,7 @@@ static void nfs_direct_commit_complete(
                dprintk("NFS: %5u commit failed with error %d.\n",
                        data->task.tk_pid, status);
                dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 -      } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
 +      } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {
                dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
                dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
        }
@@@ -772,108 -620,6 +711,6 @@@ static void nfs_direct_write_complete(s
  }
  #endif
  
- /*
-  * NB: Return the value of the first error return code.  Subsequent
-  *     errors after the first one are ignored.
-  */
- /*
-  * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
-  * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
-  * bail and stop sending more writes.  Write length accounting is
-  * handled automatically by nfs_direct_write_result().  Otherwise, if
-  * no requests have been sent, just return an error.
-  */
- static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
-                                                const struct iovec *iov,
-                                                loff_t pos, bool uio)
- {
-       struct nfs_direct_req *dreq = desc->pg_dreq;
-       struct nfs_open_context *ctx = dreq->ctx;
-       struct inode *inode = ctx->dentry->d_inode;
-       unsigned long user_addr = (unsigned long)iov->iov_base;
-       size_t count = iov->iov_len;
-       size_t wsize = NFS_SERVER(inode)->wsize;
-       unsigned int pgbase;
-       int result;
-       ssize_t started = 0;
-       struct page **pagevec = NULL;
-       unsigned int npages;
-
-       do {
-               size_t bytes;
-               int i;
-
-               pgbase = user_addr & ~PAGE_MASK;
-               bytes = min(max_t(size_t, wsize, PAGE_SIZE), count);
-
-               result = -ENOMEM;
-               npages = nfs_page_array_len(pgbase, bytes);
-               if (!pagevec)
-                       pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
-               if (!pagevec)
-                       break;
-               if (uio) {
-                       down_read(&current->mm->mmap_sem);
-                       result = get_user_pages(current, current->mm, user_addr,
-                                               npages, 0, 0, pagevec, NULL);
-                       up_read(&current->mm->mmap_sem);
-                       if (result < 0)
-                               break;
-               } else {
-                       WARN_ON(npages != 1);
-                       result = get_kernel_page(user_addr, 0, pagevec);
-                       if (WARN_ON(result != 1))
-                               break;
-               }
-
-               if ((unsigned)result < npages) {
-                       bytes = result * PAGE_SIZE;
-                       if (bytes <= pgbase) {
-                               nfs_direct_release_pages(pagevec, result);
-                               break;
-                       }
-                       bytes -= pgbase;
-                       npages = result;
-               }
-
-               for (i = 0; i < npages; i++) {
-                       struct nfs_page *req;
-                       unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
-                       req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
-                                                pgbase, req_len);
-                       if (IS_ERR(req)) {
-                               result = PTR_ERR(req);
-                               break;
-                       }
-                       nfs_lock_request(req);
-                       req->wb_index = pos >> PAGE_SHIFT;
-                       req->wb_offset = pos & ~PAGE_MASK;
-                       if (!nfs_pageio_add_request(desc, req)) {
-                               result = desc->pg_error;
-                               nfs_unlock_and_release_request(req);
-                               break;
-                       }
-                       pgbase = 0;
-                       bytes -= req_len;
-                       started += req_len;
-                       user_addr += req_len;
-                       pos += req_len;
-                       count -= req_len;
-                       dreq->bytes_left -= req_len;
-               }
-               /* The nfs_page now hold references to these pages */
-               nfs_direct_release_pages(pagevec, npages);
-       } while (count != 0 && result >= 0);
-
-       kfree(pagevec);
-
-       if (started)
-               return started;
-       return result < 0 ? (ssize_t) result : -EFAULT;
- }
  static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
  {
        struct nfs_direct_req *dreq = hdr->dreq;
                        if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
                                bit = NFS_IOHDR_NEED_RESCHED;
                        else if (dreq->flags == 0) {
 -                              memcpy(&dreq->verf, hdr->verf,
 -                                     sizeof(dreq->verf));
 +                              nfs_direct_set_hdr_verf(dreq, hdr);
                                bit = NFS_IOHDR_NEED_COMMIT;
                                dreq->flags = NFS_ODIRECT_DO_COMMIT;
                        } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
 -                              if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) {
 -                                      dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 +                              if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) {
 +                                      dreq->flags =
 +                                              NFS_ODIRECT_RESCHED_WRITES;
                                        bit = NFS_IOHDR_NEED_RESCHED;
                                } else
                                        bit = NFS_IOHDR_NEED_COMMIT;
        spin_unlock(&dreq->lock);
  
        while (!list_empty(&hdr->pages)) {
 +              bool do_destroy = true;
 +
                req = nfs_list_entry(hdr->pages.next);
                nfs_list_remove_request(req);
                switch (bit) {
                case NFS_IOHDR_NEED_COMMIT:
                        kref_get(&req->wb_kref);
                        nfs_mark_request_commit(req, hdr->lseg, &cinfo);
 +                      do_destroy = false;
                }
                nfs_unlock_and_release_request(req);
        }
@@@ -956,33 -699,78 +793,77 @@@ static const struct nfs_pgio_completion
        .completion = nfs_direct_write_completion,
  };
  
+ /*
+  * NB: Return the value of the first error return code.  Subsequent
+  *     errors after the first one are ignored.
+  */
+ /*
+  * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
+  * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
+  * bail and stop sending more writes.  Write length accounting is
+  * handled automatically by nfs_direct_write_result().  Otherwise, if
+  * no requests have been sent, just return an error.
+  */
  static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
-                                              const struct iovec *iov,
-                                              unsigned long nr_segs,
-                                              loff_t pos, bool uio)
+                                              struct iov_iter *iter,
+                                              loff_t pos)
  {
        struct nfs_pageio_descriptor desc;
        struct inode *inode = dreq->inode;
        ssize_t result = 0;
        size_t requested_bytes = 0;
-       unsigned long seg;
+       size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
  
 -      NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE,
 +      nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
                              &nfs_direct_write_completion_ops);
        desc.pg_dreq = dreq;
        get_dreq(dreq);
        atomic_inc(&inode->i_dio_count);
  
-       NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs);
-       for (seg = 0; seg < nr_segs; seg++) {
-               const struct iovec *vec = &iov[seg];
-               result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
+       NFS_I(inode)->write_io += iov_iter_count(iter);
+       while (iov_iter_count(iter)) {
+               struct page **pagevec;
+               size_t bytes;
+               size_t pgbase;
+               unsigned npages, i;
+
+               result = iov_iter_get_pages_alloc(iter, &pagevec,
+                                                 wsize, &pgbase);
                if (result < 0)
                        break;
-               requested_bytes += result;
-               if ((size_t)result < vec->iov_len)
+               bytes = result;
+               iov_iter_advance(iter, bytes);
+               npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+               for (i = 0; i < npages; i++) {
+                       struct nfs_page *req;
+                       unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 -                      req = nfs_create_request(dreq->ctx, inode,
 -                                               pagevec[i],
++                      req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
+                                                pgbase, req_len);
+                       if (IS_ERR(req)) {
+                               result = PTR_ERR(req);
+                               break;
+                       }
+                       nfs_lock_request(req);
+                       req->wb_index = pos >> PAGE_SHIFT;
+                       req->wb_offset = pos & ~PAGE_MASK;
+                       if (!nfs_pageio_add_request(&desc, req)) {
+                               result = desc.pg_error;
+                               nfs_unlock_and_release_request(req);
+                               break;
+                       }
+                       pgbase = 0;
+                       bytes -= req_len;
+                       requested_bytes += req_len;
+                       pos += req_len;
+                       dreq->bytes_left -= req_len;
+               }
+               nfs_direct_release_pages(pagevec, npages);
+               kvfree(pagevec);
+               if (result < 0)
                        break;
-               pos += vec->iov_len;
        }
        nfs_pageio_complete(&desc);
  
  /**
   * nfs_file_direct_write - file direct write operation for NFS files
   * @iocb: target I/O control block
-  * @iov: vector of user buffers from which to write data
-  * @nr_segs: size of iov vector
+  * @iter: vector of user buffers from which to write data
   * @pos: byte offset in file where writing starts
   *
   * We use this function for direct writes instead of calling
   * Note that O_APPEND is not supported for NFS direct writes, as there
   * is no atomic O_APPEND write facility in the NFS protocol.
   */
- ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos, bool uio)
+ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+                               loff_t pos, bool uio)
  {
        ssize_t result = -EINVAL;
        struct file *file = iocb->ki_filp;
        struct nfs_direct_req *dreq;
        struct nfs_lock_context *l_ctx;
        loff_t end;
-       size_t count;
-       count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(iter);
        end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
  
        nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
  
-       result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+       result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
  
        if (mapping->nrpages) {
                invalidate_inode_pages2_range(mapping,
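
The two rewritten NFS scheduling loops above use the allocating variant of the pinning primitive: iov_iter_get_pages_alloc() sizes and allocates the page array itself, which is why the release path is kvfree() rather than kfree() (the array may come from vmalloc for large requests). The loop skeleton, with the nfs_page setup elided:

    while (iov_iter_count(iter)) {
            struct page **pagevec;
            size_t pgbase;
            unsigned npages;
            ssize_t bytes;

            /* wsize: the server's transfer-size cap, as in the hunk above */
            bytes = iov_iter_get_pages_alloc(iter, &pagevec, wsize, &pgbase);
            if (bytes < 0)
                    break;
            iov_iter_advance(iter, bytes);
            npages = DIV_ROUND_UP(bytes + pgbase, PAGE_SIZE);

            /* ... create and queue one request per pinned page ... */

            nfs_direct_release_pages(pagevec, npages);
            kvfree(pagevec);
    }
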
diff --combined fs/nfs/file.c
index c1edf7336315c3f8ddffe45261d814b8f1877771,f4ae5d0525e25e38cb4bffdd8a2d15c911247e60..4042ff58fe3f3d0b18d705774c3f6d975e642248
@@@ -165,22 -165,21 +165,21 @@@ nfs_file_flush(struct file *file, fl_ow
  EXPORT_SYMBOL_GPL(nfs_file_flush);
  
  ssize_t
- nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long nr_segs, loff_t pos)
+ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
  {
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t result;
  
        if (iocb->ki_filp->f_flags & O_DIRECT)
-               return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
+               return nfs_file_direct_read(iocb, to, iocb->ki_pos, true);
  
-       dprintk("NFS: read(%pD2, %lu@%lu)\n",
+       dprintk("NFS: read(%pD2, %zu@%lu)\n",
                iocb->ki_filp,
-               (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
+               iov_iter_count(to), (unsigned long) iocb->ki_pos);
  
        result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
        if (!result) {
-               result = generic_file_aio_read(iocb, iov, nr_segs, pos);
+               result = generic_file_read_iter(iocb, to);
                if (result > 0)
                        nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
        }
@@@ -635,24 -634,24 +634,24 @@@ static int nfs_need_sync_write(struct f
        return 0;
  }
  
- ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
-                      unsigned long nr_segs, loff_t pos)
+ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        unsigned long written = 0;
        ssize_t result;
-       size_t count = iov_length(iov, nr_segs);
+       size_t count = iov_iter_count(from);
+       loff_t pos = iocb->ki_pos;
  
        result = nfs_key_timeout_notify(file, inode);
        if (result)
                return result;
  
        if (file->f_flags & O_DIRECT)
-               return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
+               return nfs_file_direct_write(iocb, from, pos, true);
  
-       dprintk("NFS: write(%pD2, %lu@%Ld)\n",
-               file, (unsigned long) count, (long long) pos);
+       dprintk("NFS: write(%pD2, %zu@%Ld)\n",
+               file, count, (long long) pos);
  
        result = -EBUSY;
        if (IS_SWAPFILE(inode))
        if (!count)
                goto out;
  
-       result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+       result = generic_file_write_iter(iocb, from);
        if (result > 0)
                written = result;
  
@@@ -691,36 -690,6 +690,6 @@@ out_swapfile
  }
  EXPORT_SYMBOL_GPL(nfs_file_write);
  
- ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
-                             struct file *filp, loff_t *ppos,
-                             size_t count, unsigned int flags)
- {
-       struct inode *inode = file_inode(filp);
-       unsigned long written = 0;
-       ssize_t ret;
-       dprintk("NFS splice_write(%pD2, %lu@%llu)\n",
-               filp, (unsigned long) count, (unsigned long long) *ppos);
-       /*
-        * The combination of splice and an O_APPEND destination is disallowed.
-        */
-       ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
-       if (ret > 0)
-               written = ret;
-       if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
-               int err = vfs_fsync(filp, 0);
-               if (err < 0)
-                       ret = err;
-       }
-       if (ret > 0)
-               nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
-       return ret;
- }
- EXPORT_SYMBOL_GPL(nfs_file_splice_write);
  static int
  do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
  {
@@@ -916,6 -885,10 +885,6 @@@ int nfs_flock(struct file *filp, int cm
                is_local = 1;
  
        /* We're simulating flock() locks using posix locks on the server */
 -      fl->fl_owner = (fl_owner_t)filp;
 -      fl->fl_start = 0;
 -      fl->fl_end = OFFSET_MAX;
 -
        if (fl->fl_type == F_UNLCK)
                return do_unlk(filp, cmd, fl, is_local);
        return do_setlk(filp, cmd, fl, is_local);
@@@ -935,10 -908,10 +904,10 @@@ EXPORT_SYMBOL_GPL(nfs_setlease)
  
  const struct file_operations nfs_file_operations = {
        .llseek         = nfs_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = nfs_file_read,
-       .aio_write      = nfs_file_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = nfs_file_read,
+       .write_iter     = nfs_file_write,
        .mmap           = nfs_file_mmap,
        .open           = nfs_file_open,
        .flush          = nfs_file_flush,
        .lock           = nfs_lock,
        .flock          = nfs_flock,
        .splice_read    = nfs_file_splice_read,
-       .splice_write   = nfs_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .check_flags    = nfs_check_flags,
        .setlease       = nfs_setlease,
  };
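
nfs_file_splice_write(), deleted above, illustrates why generic_file_splice_write() could be killed at the end of this series: iter_file_splice_write() feeds the pipe pages to ->write_iter() as a bvec-backed iov_iter, so nfs_file_write() and its sync-on-write handling now cover the splice path too, and the per-filesystem splice_write duplicates go away. The wiring shrinks to:

    .splice_read    = nfs_file_splice_read,
    .splice_write   = iter_file_splice_write,       /* routed through ->write_iter() */
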
diff --combined fs/nfs/internal.h
index 8b69cba1bb04d9b177bca18a2f95c7b0162b8cf1,0e4e8049c9f5318bed90a4ed939398fa9e47cdf8..82ddbf46660e3c1be7d499f2ca014ce619da8603
@@@ -231,20 -231,13 +231,20 @@@ extern void nfs_destroy_writepagecache(
  
  extern int __init nfs_init_directcache(void);
  extern void nfs_destroy_directcache(void);
 -extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount);
  extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
                              struct nfs_pgio_header *hdr,
                              void (*release)(struct nfs_pgio_header *hdr));
  void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
  int nfs_iocounter_wait(struct nfs_io_counter *c);
  
 +extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
 +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *);
 +void nfs_rw_header_free(struct nfs_pgio_header *);
 +void nfs_pgio_data_release(struct nfs_pgio_data *);
 +int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
 +int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *,
 +                    const struct rpc_call_ops *, int, int);
 +
  static inline void nfs_iocounter_init(struct nfs_io_counter *c)
  {
        c->flags = 0;
@@@ -327,16 -320,14 +327,14 @@@ int nfs_rename(struct inode *, struct d
  int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
  loff_t nfs_file_llseek(struct file *, loff_t, int);
  int nfs_file_flush(struct file *, fl_owner_t);
- ssize_t nfs_file_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
  ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
                             size_t, unsigned int);
  int nfs_file_mmap(struct file *, struct vm_area_struct *);
- ssize_t nfs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
  int nfs_file_release(struct inode *, struct file *);
  int nfs_lock(struct file *, int, struct file_lock *);
  int nfs_flock(struct file *, int, struct file_lock *);
- ssize_t nfs_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *,
-                             size_t, unsigned int);
  int nfs_check_flags(int);
  int nfs_setlease(struct file *, long, struct file_lock **);
  
@@@ -402,11 -393,19 +400,11 @@@ extern int nfs4_get_rootfh(struct nfs_s
  
  struct nfs_pgio_completion_ops;
  /* read.c */
 -extern struct nfs_read_header *nfs_readhdr_alloc(void);
 -extern void nfs_readhdr_free(struct nfs_pgio_header *hdr);
  extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 -                      struct inode *inode,
 +                      struct inode *inode, bool force_mds,
                        const struct nfs_pgio_completion_ops *compl_ops);
 -extern int nfs_initiate_read(struct rpc_clnt *clnt,
 -                           struct nfs_read_data *data,
 -                           const struct rpc_call_ops *call_ops, int flags);
  extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 -extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
 -                            struct nfs_pgio_header *hdr);
  extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 -extern void nfs_readdata_release(struct nfs_read_data *rdata);
  
  /* super.c */
  void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
@@@ -421,10 -420,19 +419,10 @@@ int nfs_remount(struct super_block *sb
  
  /* write.c */
  extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 -                      struct inode *inode, int ioflags,
 +                      struct inode *inode, int ioflags, bool force_mds,
                        const struct nfs_pgio_completion_ops *compl_ops);
 -extern struct nfs_write_header *nfs_writehdr_alloc(void);
 -extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
 -extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
 -                           struct nfs_pgio_header *hdr);
  extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
 -extern void nfs_writedata_release(struct nfs_write_data *wdata);
  extern void nfs_commit_free(struct nfs_commit_data *p);
 -extern int nfs_initiate_write(struct rpc_clnt *clnt,
 -                            struct nfs_write_data *data,
 -                            const struct rpc_call_ops *call_ops,
 -                            int how, int flags);
  extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
  extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
  extern int nfs_initiate_commit(struct rpc_clnt *clnt,
@@@ -437,7 -445,6 +435,7 @@@ extern void nfs_init_commit(struct nfs_
                            struct nfs_commit_info *cinfo);
  int nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
                         struct nfs_commit_info *cinfo, int max);
 +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *);
  int nfs_scan_commit(struct inode *inode, struct list_head *dst,
                    struct nfs_commit_info *cinfo);
  void nfs_mark_request_commit(struct nfs_page *req,
@@@ -483,7 -490,7 +481,7 @@@ static inline void nfs_inode_dio_wait(s
  extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
  
  /* nfs4proc.c */
 -extern void __nfs4_read_done_cb(struct nfs_read_data *);
 +extern void __nfs4_read_done_cb(struct nfs_pgio_data *);
  extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
                            const struct rpc_timeout *timeparms,
                            const char *ip_addr);
diff --combined fs/nfs/nfs4file.c
index 464db9dd63180dc7baf3695f51471747426144fb,50de2cdea082580e1020903d2aa7e1a06644704f..a816f0627a6ce03cda2502c42c780c5ab6a2742c
@@@ -100,7 -100,8 +100,7 @@@ nfs4_file_fsync(struct file *file, loff
                        break;
                mutex_lock(&inode->i_mutex);
                ret = nfs_file_fsync_commit(file, start, end, datasync);
 -              if (!ret && !datasync)
 -                      /* application has asked for meta-data sync */
 +              if (!ret)
                        ret = pnfs_layoutcommit_inode(inode, true);
                mutex_unlock(&inode->i_mutex);
                /*
  
  const struct file_operations nfs4_file_operations = {
        .llseek         = nfs_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = nfs_file_read,
-       .aio_write      = nfs_file_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = nfs_file_read,
+       .write_iter     = nfs_file_write,
        .mmap           = nfs_file_mmap,
        .open           = nfs4_file_open,
        .flush          = nfs_file_flush,
        .lock           = nfs_lock,
        .flock          = nfs_flock,
        .splice_read    = nfs_file_splice_read,
-       .splice_write   = nfs_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .check_flags    = nfs_check_flags,
        .setlease       = nfs_setlease,
  };
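
This nfs4 table shows the wiring pattern the series repeats in every filesystem it touches: .read/.write become the new_sync_* shims (which build an iov_iter and call the *_iter methods), and per-filesystem splice_write paths collapse into iter_file_splice_write(), which also goes through ->write_iter(). A hedged sketch of the same wiring for a filesystem that can use the fully generic paths (the myfs naming is hypothetical):

	const struct file_operations myfs_file_operations = {
		.llseek		= generic_file_llseek,
		.read		= new_sync_read,	/* sync path over ->read_iter */
		.write		= new_sync_write,	/* sync path over ->write_iter */
		.read_iter	= generic_file_read_iter,
		.write_iter	= generic_file_write_iter,
		.mmap		= generic_file_mmap,
		.splice_read	= generic_file_splice_read,
		.splice_write	= iter_file_splice_write, /* splice over ->write_iter */
	};
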
diff --combined fs/ntfs/file.c
index 86ddab916b6607e3cab28c276359b8b98971a46c,89b4d6663775276b2a0229026b6b19bff46f2a27..5c9e2c81cb11db029ece7873766041ada8c65024
@@@ -2060,6 -2060,7 +2060,6 @@@ static ssize_t ntfs_file_buffered_write
                }
                do {
                        unlock_page(pages[--do_pages]);
 -                      mark_page_accessed(pages[do_pages]);
                        page_cache_release(pages[do_pages]);
                } while (do_pages);
                if (unlikely(status))
@@@ -2090,10 -2091,7 +2090,7 @@@ static ssize_t ntfs_file_aio_write_nolo
        size_t count;           /* after file limit checks */
        ssize_t written, err;
  
-       count = 0;
-       err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
-       if (err)
-               return err;
+       count = iov_length(iov, nr_segs);
        pos = *ppos;
        /* We can write back this queue in page reclaim. */
        current->backing_dev_info = mapping->backing_dev_info;
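
The deleted generic_segment_checks() call both validated the user iovec and returned the total byte count; on the iov_iter-based paths the validation happens at copy time, so ntfs only needs the length, which iov_length() computes by summing the segments. A self-contained user-space model of that sum, for illustration only:

	#include <stddef.h>
	#include <sys/uio.h>

	/* what iov_length() boils down to: total bytes across all segments */
	static size_t iov_length_model(const struct iovec *iov,
				       unsigned long nr_segs)
	{
		size_t total = 0;
		unsigned long seg;

		for (seg = 0; seg < nr_segs; seg++)
			total += iov[seg].iov_len;
		return total;
	}
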
@@@ -2202,8 -2200,8 +2199,8 @@@ static int ntfs_file_fsync(struct file 
  
  const struct file_operations ntfs_file_ops = {
        .llseek         = generic_file_llseek,   /* Seek inside file. */
-       .read           = do_sync_read,          /* Read from file. */
-       .aio_read       = generic_file_aio_read, /* Async read from file. */
+       .read           = new_sync_read,         /* Read from file. */
+       .read_iter      = generic_file_read_iter, /* Async read from file. */
  #ifdef NTFS_RW
        .write          = do_sync_write,         /* Write to file. */
        .aio_write      = ntfs_file_aio_write,   /* Async write to file. */
diff --combined fs/ocfs2/file.c
index 8eb6e5732d3b73b115abea0681cdd95d9195becf,465c95016a39abed758984f0d360dae8c0396193..2930e231f3f9fbda2807190726d6ceffffa2a6b5
@@@ -828,7 -828,7 +828,7 @@@ static int ocfs2_write_zero_page(struc
                /*
                 * fs-writeback will release the dirty pages without page lock
                 * whose offset are over inode size, the release happens at
 -               * block_write_full_page_endio().
 +               * block_write_full_page().
                 */
                i_size_write(inode, abs_to);
                inode->i_blocks = ocfs2_inode_sector_count(inode);
        return ret;
  }
  
- static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
-                                   const struct iovec *iov,
-                                   unsigned long nr_segs,
-                                   loff_t pos)
+ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
+                                   struct iov_iter *from)
  {
        int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
        int can_do_direct, has_refcount = 0;
        ssize_t written = 0;
-       size_t ocount;          /* original count */
-       size_t count;           /* after file limit checks */
+       size_t count = iov_iter_count(from);
        loff_t old_size, *ppos = &iocb->ki_pos;
        u32 old_clusters;
        struct file *file = iocb->ki_filp;
                (unsigned long long)OCFS2_I(inode)->ip_blkno,
                file->f_path.dentry->d_name.len,
                file->f_path.dentry->d_name.name,
-               (unsigned int)nr_segs);
+               (unsigned int)from->nr_segs);   /* GRRRRR */
  
        if (iocb->ki_nbytes == 0)
                return 0;
@@@ -2354,29 -2351,21 +2351,21 @@@ relock
        /* communicate with ocfs2_dio_end_io */
        ocfs2_iocb_set_rw_locked(iocb, rw_level);
  
-       ret = generic_segment_checks(iov, &nr_segs, &ocount,
-                                    VERIFY_READ);
-       if (ret)
-               goto out_dio;
-       count = ocount;
        ret = generic_write_checks(file, ppos, &count,
                                   S_ISBLK(inode->i_mode));
        if (ret)
                goto out_dio;
  
+       iov_iter_truncate(from, count);
        if (direct_io) {
-               written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
-                                                   count, ocount);
+               written = generic_file_direct_write(iocb, from, *ppos);
                if (written < 0) {
                        ret = written;
                        goto out_dio;
                }
        } else {
-               struct iov_iter from;
-               iov_iter_init(&from, iov, nr_segs, count, 0);
                current->backing_dev_info = file->f_mapping->backing_dev_info;
-               written = generic_perform_write(file, &from, *ppos);
+               written = generic_perform_write(file, from, *ppos);
                if (likely(written >= 0))
                        iocb->ki_pos = *ppos + written;
                current->backing_dev_info = NULL;
@@@ -2441,84 -2430,6 +2430,6 @@@ out_sems
        return ret;
  }
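
The converted ocfs2_file_write_iter() above follows the shape shared by the ->write_iter conversions in this series: take the byte count from the iterator, run generic_write_checks(), clamp the iterator to the possibly-reduced count, then hand the same iov_iter to the direct or buffered generic helper. A condensed sketch of just that skeleton, with the ocfs2 locking and refcount handling elided (a schematic under those assumptions, not a drop-in implementation):

	static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
	{
		struct file *file = iocb->ki_filp;
		struct inode *inode = file_inode(file);
		size_t count = iov_iter_count(from);	/* bytes left in the iterator */
		loff_t pos = iocb->ki_pos;
		ssize_t written;
		int ret;

		ret = generic_write_checks(file, &pos, &count,
					   S_ISBLK(inode->i_mode));
		if (ret)
			return ret;

		/* generic_write_checks() may shrink count; clamp the iterator too */
		iov_iter_truncate(from, count);

		if (file->f_flags & O_DIRECT) {
			/* updates iocb->ki_pos itself on success */
			written = generic_file_direct_write(iocb, from, pos);
		} else {
			current->backing_dev_info = file->f_mapping->backing_dev_info;
			written = generic_perform_write(file, from, pos);
			if (likely(written >= 0))
				iocb->ki_pos = pos + written;
			current->backing_dev_info = NULL;
		}
		return written;
	}
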
  
- static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
-                               struct file *out,
-                               struct splice_desc *sd)
- {
-       int ret;
-       ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
-                                           sd->total_len, 0, NULL, NULL);
-       if (ret < 0) {
-               mlog_errno(ret);
-               return ret;
-       }
-       return splice_from_pipe_feed(pipe, sd, pipe_to_file);
- }
- static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
-                                      struct file *out,
-                                      loff_t *ppos,
-                                      size_t len,
-                                      unsigned int flags)
- {
-       int ret;
-       struct address_space *mapping = out->f_mapping;
-       struct inode *inode = mapping->host;
-       struct splice_desc sd = {
-               .total_len = len,
-               .flags = flags,
-               .pos = *ppos,
-               .u.file = out,
-       };
-       trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
-                       (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                       out->f_path.dentry->d_name.len,
-                       out->f_path.dentry->d_name.name, len);
-       pipe_lock(pipe);
-       splice_from_pipe_begin(&sd);
-       do {
-               ret = splice_from_pipe_next(pipe, &sd);
-               if (ret <= 0)
-                       break;
-               mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
-               ret = ocfs2_rw_lock(inode, 1);
-               if (ret < 0)
-                       mlog_errno(ret);
-               else {
-                       ret = ocfs2_splice_to_file(pipe, out, &sd);
-                       ocfs2_rw_unlock(inode, 1);
-               }
-               mutex_unlock(&inode->i_mutex);
-       } while (ret > 0);
-       splice_from_pipe_end(pipe, &sd);
-       pipe_unlock(pipe);
-       if (sd.num_spliced)
-               ret = sd.num_spliced;
-       if (ret > 0) {
-               int err;
-               err = generic_write_sync(out, *ppos, ret);
-               if (err)
-                       ret = err;
-               else
-                       *ppos += ret;
-               balance_dirty_pages_ratelimited(mapping);
-       }
-       return ret;
- }
  static ssize_t ocfs2_file_splice_read(struct file *in,
                                      loff_t *ppos,
                                      struct pipe_inode_info *pipe,
                        in->f_path.dentry->d_name.name, len);
  
        /*
-        * See the comment in ocfs2_file_aio_read()
+        * See the comment in ocfs2_file_read_iter()
         */
        ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
        if (ret < 0) {
@@@ -2549,10 -2460,8 +2460,8 @@@ bail
        return ret;
  }
  
- static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
-                                  const struct iovec *iov,
-                                  unsigned long nr_segs,
-                                  loff_t pos)
+ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
+                                  struct iov_iter *to)
  {
        int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
        struct file *filp = iocb->ki_filp;
        trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
                        filp->f_path.dentry->d_name.len,
-                       filp->f_path.dentry->d_name.name, nr_segs);
+                       filp->f_path.dentry->d_name.name,
+                       to->nr_segs);   /* GRRRRR */
  
  
        if (!inode) {
        }
        ocfs2_inode_unlock(inode, lock_level);
  
-       ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
+       ret = generic_file_read_iter(iocb, to);
        trace_generic_file_aio_read_ret(ret);
  
        /* buffered aio wouldn't have proper lock coverage today */
        BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
  
-       /* see ocfs2_file_aio_write */
+       /* see ocfs2_file_write_iter */
        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
                rw_level = -1;
                have_alloc_sem = 0;
@@@ -2705,14 -2615,14 +2615,14 @@@ const struct inode_operations ocfs2_spe
   */
  const struct file_operations ocfs2_fops = {
        .llseek         = ocfs2_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
        .mmap           = ocfs2_mmap,
        .fsync          = ocfs2_sync_file,
        .release        = ocfs2_file_release,
        .open           = ocfs2_file_open,
-       .aio_read       = ocfs2_file_aio_read,
-       .aio_write      = ocfs2_file_aio_write,
+       .read_iter      = ocfs2_file_read_iter,
+       .write_iter     = ocfs2_file_write_iter,
        .unlocked_ioctl = ocfs2_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
        .lock           = ocfs2_lock,
        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
-       .splice_write   = ocfs2_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .fallocate      = ocfs2_fallocate,
  };
  
@@@ -2753,21 -2663,21 +2663,21 @@@ const struct file_operations ocfs2_dop
   */
  const struct file_operations ocfs2_fops_no_plocks = {
        .llseek         = ocfs2_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
        .mmap           = ocfs2_mmap,
        .fsync          = ocfs2_sync_file,
        .release        = ocfs2_file_release,
        .open           = ocfs2_file_open,
-       .aio_read       = ocfs2_file_aio_read,
-       .aio_write      = ocfs2_file_aio_write,
+       .read_iter      = ocfs2_file_read_iter,
+       .write_iter     = ocfs2_file_write_iter,
        .unlocked_ioctl = ocfs2_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
  #endif
        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
-       .splice_write   = ocfs2_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .fallocate      = ocfs2_fallocate,
  };
  
diff --combined fs/reiserfs/file.c
index 5f6c32c668b68816584f19c982c4b9a22ded751b,f070cc827456b68cc3dd3a79bab4009a2c5a2924..db9e80ba53a0db5abe4910fa128bab1e6a2ee6ad
  #include <linux/quotaops.h>
  
  /*
 -** We pack the tails of files on file close, not at the time they are written.
 -** This implies an unnecessary copy of the tail and an unnecessary indirect item
 -** insertion/balancing, for files that are written in one write.
 -** It avoids unnecessary tail packings (balances) for files that are written in
 -** multiple writes and are small enough to have tails.
 -**
 -** file_release is called by the VFS layer when the file is closed.  If
 -** this is the last open file descriptor, and the file
 -** small enough to have a tail, and the tail is currently in an
 -** unformatted node, the tail is converted back into a direct item.
 -**
 -** We use reiserfs_truncate_file to pack the tail, since it already has
 -** all the conditions coded.
 -*/
 + * We pack the tails of files on file close, not at the time they are written.
 + * This implies an unnecessary copy of the tail and an unnecessary indirect item
 + * insertion/balancing, for files that are written in one write.
 + * It avoids unnecessary tail packings (balances) for files that are written in
 + * multiple writes and are small enough to have tails.
 + *
 + * file_release is called by the VFS layer when the file is closed.  If
 + * this is the last open file descriptor, and the file is
 + * small enough to have a tail, and the tail is currently in an
 + * unformatted node, the tail is converted back into a direct item.
 + *
 + * We use reiserfs_truncate_file to pack the tail, since it already has
 + * all the conditions coded.
 + */
  static int reiserfs_file_release(struct inode *inode, struct file *filp)
  {
  
          if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
                return 0;
  
 -      mutex_lock(&(REISERFS_I(inode)->tailpack));
 +      mutex_lock(&REISERFS_I(inode)->tailpack);
  
          if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
 -              mutex_unlock(&(REISERFS_I(inode)->tailpack));
 +              mutex_unlock(&REISERFS_I(inode)->tailpack);
                return 0;
        }
  
        if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
             !tail_has_to_be_packed(inode)) &&
            REISERFS_I(inode)->i_prealloc_count <= 0) {
 -              mutex_unlock(&(REISERFS_I(inode)->tailpack));
 +              mutex_unlock(&REISERFS_I(inode)->tailpack);
                return 0;
        }
  
        reiserfs_write_lock(inode->i_sb);
 -      /* freeing preallocation only involves relogging blocks that
 +      /*
 +       * freeing preallocation only involves relogging blocks that
         * are already in the current transaction.  preallocation gets
         * freed at the end of each transaction, so it is impossible for
         * us to log any additional blocks (including quota blocks)
         */
        err = journal_begin(&th, inode->i_sb, 1);
        if (err) {
 -              /* uh oh, we can't allow the inode to go away while there
 +              /*
 +               * uh oh, we can't allow the inode to go away while there
                 * are still preallocation blocks pending.  Try to join the
                 * aborted transaction
                 */
                jbegin_failure = err;
 -              err = journal_join_abort(&th, inode->i_sb, 1);
 +              err = journal_join_abort(&th, inode->i_sb);
  
                if (err) {
 -                      /* hmpf, our choices here aren't good.  We can pin the inode
 -                       * which will disallow unmount from every happening, we can
 -                       * do nothing, which will corrupt random memory on unmount,
 -                       * or we can forcibly remove the file from the preallocation
 -                       * list, which will leak blocks on disk.  Lets pin the inode
 +                      /*
 +                       * hmpf, our choices here aren't good.  We can pin
 +                       * the inode which will disallow unmount from ever
 +                       * happening, we can do nothing, which will corrupt
 +                       * random memory on unmount, or we can forcibly
 +                       * remove the file from the preallocation list, which
 +                       * will leak blocks on disk.  Let's pin the inode
                         * and let the admin know what is going on.
                         */
                        igrab(inode);
@@@ -96,7 -92,7 +96,7 @@@
  #ifdef REISERFS_PREALLOCATE
        reiserfs_discard_prealloc(&th, inode);
  #endif
 -      err = journal_end(&th, inode->i_sb, 1);
 +      err = journal_end(&th);
  
        /* copy back the error code from journal_begin */
        if (!err)
            (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
            tail_has_to_be_packed(inode)) {
  
 -              /* if regular file is released by last holder and it has been
 -                 appended (we append by unformatted node only) or its direct
 -                 item(s) had to be converted, then it may have to be
 -                 indirect2direct converted */
 +              /*
 +               * if regular file is released by last holder and it has been
 +               * appended (we append by unformatted node only) or its direct
 +               * item(s) had to be converted, then it may have to be
 +               * indirect2direct converted
 +               */
                err = reiserfs_truncate_file(inode, 0);
        }
 -      out:
 +out:
        reiserfs_write_unlock(inode->i_sb);
 -      mutex_unlock(&(REISERFS_I(inode)->tailpack));
 +      mutex_unlock(&REISERFS_I(inode)->tailpack);
        return err;
  }
  
  static int reiserfs_file_open(struct inode *inode, struct file *file)
  {
        int err = dquot_file_open(inode, file);
 +
 +      /* somebody might be tailpacking on final close; wait for it */
          if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
 -              /* somebody might be tailpacking on final close; wait for it */
 -              mutex_lock(&(REISERFS_I(inode)->tailpack));
 +              mutex_lock(&REISERFS_I(inode)->tailpack);
                atomic_inc(&REISERFS_I(inode)->openers);
 -              mutex_unlock(&(REISERFS_I(inode)->tailpack));
 +              mutex_unlock(&REISERFS_I(inode)->tailpack);
        }
        return err;
  }
  
  void reiserfs_vfs_truncate_file(struct inode *inode)
  {
 -      mutex_lock(&(REISERFS_I(inode)->tailpack));
 +      mutex_lock(&REISERFS_I(inode)->tailpack);
        reiserfs_truncate_file(inode, 1);
 -      mutex_unlock(&(REISERFS_I(inode)->tailpack));
 +      mutex_unlock(&REISERFS_I(inode)->tailpack);
  }
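
The openers/tailpack pairing above is worth spelling out, since the cleanup makes it easier to read: openers is an atomic count, and only the transitions that can race with a final-close tail pack take the mutex. A sketch of the handshake, using exactly the fields shown in these hunks:

	/* open: the common case is a second opener, no mutex needed */
	if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
		/* 0 -> 1: a final-close tail pack may be in flight; wait for it */
		mutex_lock(&REISERFS_I(inode)->tailpack);
		atomic_inc(&REISERFS_I(inode)->openers);
		mutex_unlock(&REISERFS_I(inode)->tailpack);
	}

	/* release: drop straight through unless we might be the last opener */
	if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
		return 0;			/* count was > 1 */

	mutex_lock(&REISERFS_I(inode)->tailpack);
	if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
		mutex_unlock(&REISERFS_I(inode)->tailpack);
		return 0;			/* raced with a new opener */
	}
	/* ... last opener: pack the tail under the mutex, then unlock ... */
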
  
  /* Sync a reiserfs file. */
@@@ -212,11 -205,10 +212,11 @@@ int reiserfs_commit_page(struct inode *
                        set_buffer_uptodate(bh);
                        if (logit) {
                                reiserfs_prepare_for_journal(s, bh, 1);
 -                              journal_mark_dirty(&th, s, bh);
 +                              journal_mark_dirty(&th, bh);
                        } else if (!buffer_dirty(bh)) {
                                mark_buffer_dirty(bh);
 -                              /* do data=ordered on any page past the end
 +                              /*
 +                               * do data=ordered on any page past the end
                                 * of file and any buffer marked BH_New.
                                 */
                                if (reiserfs_data_ordered(inode->i_sb) &&
                }
        }
        if (logit) {
 -              ret = journal_end(&th, s, bh_per_page + 1);
 -            drop_write_lock:
 +              ret = journal_end(&th);
 +drop_write_lock:
                reiserfs_write_unlock(s);
        }
        /*
  }
  
  const struct file_operations reiserfs_file_operations = {
-       .read = do_sync_read,
-       .write = do_sync_write,
+       .read = new_sync_read,
+       .write = new_sync_write,
        .unlocked_ioctl = reiserfs_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl = reiserfs_compat_ioctl,
        .open = reiserfs_file_open,
        .release = reiserfs_file_release,
        .fsync = reiserfs_sync_file,
-       .aio_read = generic_file_aio_read,
-       .aio_write = generic_file_aio_write,
+       .read_iter = generic_file_read_iter,
+       .write_iter = generic_file_write_iter,
        .splice_read = generic_file_splice_read,
-       .splice_write = generic_file_splice_write,
+       .splice_write = iter_file_splice_write,
        .llseek = generic_file_llseek,
  };
  
diff --combined fs/reiserfs/inode.c
index e3ca04894919c4d0a38f2623676d7ffe1ce6aff3,b8003e8dd1f47bf726d78a1f1a40aba7a56ecc30..63b2b0ec49e6afacd955abf9f172751768ee08ee
@@@ -25,10 -25,7 +25,10 @@@ int reiserfs_commit_write(struct file *
  
  void reiserfs_evict_inode(struct inode *inode)
  {
 -      /* We need blocks for transaction + (user+group) quota update (possibly delete) */
 +      /*
 +       * We need blocks for transaction + (user+group) quota
 +       * update (possibly delete)
 +       */
        int jbegin_count =
            JOURNAL_PER_BALANCE_CNT * 2 +
            2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
        if (inode->i_nlink)
                goto no_delete;
  
 -      /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
 -      if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {  /* also handles bad_inode case */
 +      /*
 +       * The = 0 happens when we abort creating a new inode
 +       * for some reason like lack of space.
 +       * also handles bad_inode case
 +       */
 +      if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {
  
                reiserfs_delete_xattrs(inode);
  
  
                err = reiserfs_delete_object(&th, inode);
  
 -              /* Do quota update inside a transaction for journaled quotas. We must do that
 -               * after delete_object so that quota updates go into the same transaction as
 -               * stat data deletion */
 +              /*
 +               * Do quota update inside a transaction for journaled quotas.
 +               * We must do that after delete_object so that quota updates
 +               * go into the same transaction as stat data deletion
 +               */
                if (!err) {
                        int depth = reiserfs_write_unlock_nested(inode->i_sb);
                        dquot_free_inode(inode);
                        reiserfs_write_lock_nested(inode->i_sb, depth);
                }
  
 -              if (journal_end(&th, inode->i_sb, jbegin_count))
 +              if (journal_end(&th))
                        goto out;
  
 -              /* check return value from reiserfs_delete_object after
 +              /*
 +               * check return value from reiserfs_delete_object after
                 * ending the transaction
                 */
                if (err)
                    goto out;
  
 -              /* all items of file are deleted, so we can remove "save" link */
 -              remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything
 -                                                               * about an error here */
 +              /*
 +               * all items of file are deleted, so we can remove
 +               * "save" link
 +               * we can't do anything about an error here
 +               */
 +              remove_save_link(inode, 0 /* not truncate */);
  out:
                reiserfs_write_unlock(inode->i_sb);
        } else {
                /* no object items are in the tree */
                ;
        }
 -      clear_inode(inode);     /* note this must go after the journal_end to prevent deadlock */
 +
 +      /* note this must go after the journal_end to prevent deadlock */
 +      clear_inode(inode);
 +
        dquot_drop(inode);
        inode->i_blocks = 0;
        return;
@@@ -119,10 -103,8 +119,10 @@@ static void _make_cpu_key(struct cpu_ke
        key->key_length = length;
  }
  
 -/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
 -   offset and type of key */
 +/*
 + * take base of inode_key (it comes from inode always) (dirid, objectid)
 + * and version from an inode, set offset and type of key
 + */
  void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
                  int type, int length)
  {
                      length);
  }
  
 -//
 -// when key is 0, do not set version and short key
 -//
 +/* when key is 0, do not set version and short key */
  inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
                              int version,
                              loff_t offset, int type, int length,
        set_le_ih_k_type(ih, type);
        put_ih_item_len(ih, length);
        /*    set_ih_free_space (ih, 0); */
 -      // for directory items it is entry count, for directs and stat
 -      // datas - 0xffff, for indirects - 0
 +      /*
 +       * for directory items it is entry count, for directs and stat
 +       * datas - 0xffff, for indirects - 0
 +       */
        put_ih_entry_count(ih, entry_count);
  }
  
 -//
 -// FIXME: we might cache recently accessed indirect item
 -
 -// Ugh.  Not too eager for that....
 -//  I cut the code until such time as I see a convincing argument (benchmark).
 -// I don't want a bloated inode struct..., and I don't like code complexity....
 -
 -/* cutting the code is fine, since it really isn't in use yet and is easy
 -** to add back in.  But, Vladimir has a really good idea here.  Think
 -** about what happens for reading a file.  For each page,
 -** The VFS layer calls reiserfs_readpage, who searches the tree to find
 -** an indirect item.  This indirect item has X number of pointers, where
 -** X is a big number if we've done the block allocation right.  But,
 -** we only use one or two of these pointers during each call to readpage,
 -** needlessly researching again later on.
 -**
 -** The size of the cache could be dynamic based on the size of the file.
 -**
 -** I'd also like to see us cache the location the stat data item, since
 -** we are needlessly researching for that frequently.
 -**
 -** --chris
 -*/
 +/*
 + * FIXME: we might cache recently accessed indirect item
 + * Ugh.  Not too eager for that....
 + * I cut the code until such time as I see a convincing argument (benchmark).
 + * I don't want a bloated inode struct..., and I don't like code complexity....
 + */
  
 -/* If this page has a file tail in it, and
 -** it was read in by get_block_create_0, the page data is valid,
 -** but tail is still sitting in a direct item, and we can't write to
 -** it.  So, look through this page, and check all the mapped buffers
 -** to make sure they have valid block numbers.  Any that don't need
 -** to be unmapped, so that __block_write_begin will correctly call
 -** reiserfs_get_block to convert the tail into an unformatted node
 -*/
 +/*
 + * cutting the code is fine, since it really isn't in use yet and is easy
 + * to add back in.  But, Vladimir has a really good idea here.  Think
 + * about what happens for reading a file.  For each page,
 + * The VFS layer calls reiserfs_readpage, who searches the tree to find
 + * an indirect item.  This indirect item has X number of pointers, where
 + * X is a big number if we've done the block allocation right.  But,
 + * we only use one or two of these pointers during each call to readpage,
 + * needlessly researching again later on.
 + *
 + * The size of the cache could be dynamic based on the size of the file.
 + *
 + * I'd also like to see us cache the location the stat data item, since
 + * we are needlessly researching for that frequently.
 + *
 + * --chris
 + */
 +
 +/*
 + * If this page has a file tail in it, and
 + * it was read in by get_block_create_0, the page data is valid,
 + * but tail is still sitting in a direct item, and we can't write to
 + * it.  So, look through this page, and check all the mapped buffers
 + * to make sure they have valid block numbers.  Any that don't need
 + * to be unmapped, so that __block_write_begin will correctly call
 + * reiserfs_get_block to convert the tail into an unformatted node
 + */
  static inline void fix_tail_page_for_writing(struct page *page)
  {
        struct buffer_head *head, *next, *bh;
        }
  }
  
 -/* reiserfs_get_block does not need to allocate a block only if it has been
 -   done already or non-hole position has been found in the indirect item */
 +/*
 + * reiserfs_get_block does not need to allocate a block only if it has been
 + * done already or non-hole position has been found in the indirect item
 + */
  static inline int allocation_needed(int retval, b_blocknr_t allocated,
                                    struct item_head *ih,
                                    __le32 * item, int pos_in_item)
@@@ -233,16 -211,14 +233,16 @@@ static inline void set_block_dev_mapped
        map_bh(bh, inode->i_sb, block);
  }
  
 -//
 -// files which were created in the earlier version can not be longer,
 -// than 2 gb
 -//
 +/*
 + * files which were created in the earlier version can not be longer,
 + * than 2 gb
 + */
  static int file_capable(struct inode *inode, sector_t block)
  {
 -      if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||      // it is new file.
 -          block < (1 << (31 - inode->i_sb->s_blocksize_bits)))        // old file, but 'block' is inside of 2gb
 +      /* it is new file. */
 +      if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||
 +          /* old file, but 'block' is inside of 2gb */
 +          block < (1 << (31 - inode->i_sb->s_blocksize_bits)))
                return 1;
  
        return 0;
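
Worked example of that old-format limit: with 4 KiB blocks, s_blocksize_bits is 12, so the cutoff is 1 << (31 - 12) = 524288 blocks, and 524288 * 4096 bytes = 2^31 bytes = 2 GiB; any 'block' index below the cutoff is still addressable by a 3.5-format key.
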
@@@ -252,6 -228,7 +252,6 @@@ static int restart_transaction(struct r
                               struct inode *inode, struct treepath *path)
  {
        struct super_block *s = th->t_super;
 -      int len = th->t_blocks_allocated;
        int err;
  
        BUG_ON(!th->t_trans_id);
                return 0;
        }
        reiserfs_update_sd(th, inode);
 -      err = journal_end(th, s, len);
 +      err = journal_end(th);
        if (!err) {
                err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
                if (!err)
        return err;
  }
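
restart_transaction() also shows the simplified journalling convention from this series: journal_end() now takes only the handle (hence the deleted t_blocks_allocated read), and journal_mark_dirty() drops its superblock argument. A minimal sketch of the new begin/dirty/end sequence, using only calls that appear in these hunks (jbegin_count and bh stand for whatever the caller reserved and dirtied):

	struct reiserfs_transaction_handle th;
	int err;

	err = journal_begin(&th, inode->i_sb, jbegin_count);
	if (err)
		return err;

	reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
	journal_mark_dirty(&th, bh);		/* was: (&th, inode->i_sb, bh) */
	reiserfs_update_sd(&th, inode);

	err = journal_end(&th);			/* was: (&th, inode->i_sb, count) */
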
  
 -// it is called by get_block when create == 0. Returns block number
 -// for 'block'-th logical block of file. When it hits direct item it
 -// returns 0 (being called from bmap) or read direct item into piece
 -// of page (bh_result)
 -
 -// Please improve the english/clarity in the comment above, as it is
 -// hard to understand.
 -
 +/*
 + * it is called by get_block when create == 0. Returns block number
 + * for 'block'-th logical block of file. When it hits direct item it
 + * returns 0 (being called from bmap) or read direct item into piece
 + * of page (bh_result)
 + * Please improve the english/clarity in the comment above, as it is
 + * hard to understand.
 + */
  static int _get_block_create_0(struct inode *inode, sector_t block,
                               struct buffer_head *bh_result, int args)
  {
        int done = 0;
        unsigned long offset;
  
 -      // prepare the key to look for the 'block'-th block of file
 +      /* prepare the key to look for the 'block'-th block of file */
        make_cpu_key(&key, inode,
                     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
                     3);
                        kunmap(bh_result->b_page);
                if (result == IO_ERROR)
                        return -EIO;
 -              // We do not return -ENOENT if there is a hole but page is uptodate, because it means
 -              // That there is some MMAPED data associated with it that is yet to be written to disk.
 +              /*
 +               * We do not return -ENOENT if there is a hole but page is
 +               * uptodate, because it means that there is some MMAPED data
 +               * associated with it that is yet to be written to disk.
 +               */
                if ((args & GET_BLOCK_NO_HOLE)
                    && !PageUptodate(bh_result->b_page)) {
                        return -ENOENT;
                }
                return 0;
        }
 -      //
 +
        bh = get_last_bh(&path);
 -      ih = get_ih(&path);
 +      ih = tp_item_head(&path);
        if (is_indirect_le_ih(ih)) {
 -              __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);
 +              __le32 *ind_item = (__le32 *) ih_item_body(bh, ih);
  
 -              /* FIXME: here we could cache indirect item or part of it in
 -                 the inode to avoid search_by_key in case of subsequent
 -                 access to file */
 +              /*
 +               * FIXME: here we could cache indirect item or part of it in
 +               * the inode to avoid search_by_key in case of subsequent
 +               * access to file
 +               */
                blocknr = get_block_num(ind_item, path.pos_in_item);
                ret = 0;
                if (blocknr) {
                                set_buffer_boundary(bh_result);
                        }
                } else
 -                      // We do not return -ENOENT if there is a hole but page is uptodate, because it means
 -                      // That there is some MMAPED data associated with it that is yet to  be written to disk.
 +                      /*
 +                       * We do not return -ENOENT if there is a hole but
 +                       * page is uptodate, because it means that there is
 +                       * some MMAPED data associated with it that is
 +                       * yet to be written to disk.
 +                       */
                if ((args & GET_BLOCK_NO_HOLE)
                            && !PageUptodate(bh_result->b_page)) {
                        ret = -ENOENT;
                        kunmap(bh_result->b_page);
                return ret;
        }
 -      // requested data are in direct item(s)
 +      /* requested data are in direct item(s) */
        if (!(args & GET_BLOCK_READ_DIRECT)) {
 -              // we are called by bmap. FIXME: we can not map block of file
 -              // when it is stored in direct item(s)
 +              /*
 +               * we are called by bmap. FIXME: we can not map block of file
 +               * when it is stored in direct item(s)
 +               */
                pathrelse(&path);
                if (p)
                        kunmap(bh_result->b_page);
                return -ENOENT;
        }
  
 -      /* if we've got a direct item, and the buffer or page was uptodate,
 -       ** we don't want to pull data off disk again.  skip to the
 -       ** end, where we map the buffer and return
 +      /*
 +       * if we've got a direct item, and the buffer or page was uptodate,
 +       * we don't want to pull data off disk again.  skip to the
 +       * end, where we map the buffer and return
         */
        if (buffer_uptodate(bh_result)) {
                goto finished;
        } else
                /*
 -               ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
 -               ** pages without any buffers.  If the page is up to date, we don't want
 -               ** read old data off disk.  Set the up to date bit on the buffer instead
 -               ** and jump to the end
 +               * grab_tail_page can trigger calls to reiserfs_get_block on
 +               * up to date pages without any buffers.  If the page is up
 +               * to date, we don't want read old data off disk.  Set the up
 +               * to date bit on the buffer instead and jump to the end
                 */
        if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
                set_buffer_uptodate(bh_result);
                goto finished;
        }
 -      // read file tail into part of page
 +      /* read file tail into part of page */
        offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
        copy_item_head(&tmp_ih, ih);
  
 -      /* we only want to kmap if we are reading the tail into the page.
 -       ** this is not the common case, so we don't kmap until we are
 -       ** sure we need to.  But, this means the item might move if
 -       ** kmap schedules
 +      /*
 +       * we only want to kmap if we are reading the tail into the page.
 +       * this is not the common case, so we don't kmap until we are
 +       * sure we need to.  But, this means the item might move if
 +       * kmap schedules
         */
        if (!p)
                p = (char *)kmap(bh_result->b_page);
                if (!is_direct_le_ih(ih)) {
                        BUG();
                }
 -              /* make sure we don't read more bytes than actually exist in
 -               ** the file.  This can happen in odd cases where i_size isn't
 -               ** correct, and when direct item padding results in a few
 -               ** extra bytes at the end of the direct item
 +              /*
 +               * make sure we don't read more bytes than actually exist in
 +               * the file.  This can happen in odd cases where i_size isn't
 +               * correct, and when direct item padding results in a few
 +               * extra bytes at the end of the direct item
                 */
                if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
                        break;
                } else {
                        chars = ih_item_len(ih) - path.pos_in_item;
                }
 -              memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);
 +              memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars);
  
                if (done)
                        break;
  
                p += chars;
  
 +              /*
 +               * we are done if the direct item read is not the last item of
 +               * the node.  FIXME: we could try to check the right delimiting key
 +               * to see whether direct item continues in the right
 +               * neighbor or rely on i_size
 +               */
                if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
 -                      // we done, if read direct item is not the last item of
 -                      // node FIXME: we could try to check right delimiting key
 -                      // to see whether direct item continues in the right
 -                      // neighbor or rely on i_size
                        break;
  
 -              // update key to look for the next piece
 +              /* update key to look for the next piece */
                set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
                result = search_for_position_by_key(inode->i_sb, &key, &path);
                if (result != POSITION_FOUND)
 -                      // i/o error most likely
 +                      /* i/o error most likely */
                        break;
                bh = get_last_bh(&path);
 -              ih = get_ih(&path);
 +              ih = tp_item_head(&path);
        } while (1);
  
        flush_dcache_page(bh_result->b_page);
        kunmap(bh_result->b_page);
  
 -      finished:
 +finished:
        pathrelse(&path);
  
        if (result == IO_ERROR)
                return -EIO;
  
 -      /* this buffer has valid data, but isn't valid for io.  mapping it to
 +      /*
 +       * this buffer has valid data, but isn't valid for io.  mapping it to
         * block #0 tells the rest of reiserfs it just has a tail in it
         */
        map_bh(bh_result, inode->i_sb, 0);
        return 0;
  }
  
 -// this is called to create file map. So, _get_block_create_0 will not
 -// read direct item
 +/*
 + * this is called to create file map. So, _get_block_create_0 will not
 + * read direct item
 + */
  static int reiserfs_bmap(struct inode *inode, sector_t block,
                         struct buffer_head *bh_result, int create)
  {
        return 0;
  }
  
 -/* special version of get_block that is only used by grab_tail_page right
 -** now.  It is sent to __block_write_begin, and when you try to get a
 -** block past the end of the file (or a block from a hole) it returns
 -** -ENOENT instead of a valid buffer.  __block_write_begin expects to
 -** be able to do i/o on the buffers returned, unless an error value
 -** is also returned.
 -**
 -** So, this allows __block_write_begin to be used for reading a single block
 -** in a page.  Where it does not produce a valid page for holes, or past the
 -** end of the file.  This turns out to be exactly what we need for reading
 -** tails for conversion.
 -**
 -** The point of the wrapper is forcing a certain value for create, even
 -** though the VFS layer is calling this function with create==1.  If you
 -** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
 -** don't use this function.
 +/*
 + * special version of get_block that is only used by grab_tail_page right
 + * now.  It is sent to __block_write_begin, and when you try to get a
 + * block past the end of the file (or a block from a hole) it returns
 + * -ENOENT instead of a valid buffer.  __block_write_begin expects to
 + * be able to do i/o on the buffers returned, unless an error value
 + * is also returned.
 + *
 + * So, this allows __block_write_begin to be used for reading a single block
 + * in a page.  Where it does not produce a valid page for holes, or past the
 + * end of the file.  This turns out to be exactly what we need for reading
 + * tails for conversion.
 + *
 + * The point of the wrapper is forcing a certain value for create, even
 + * though the VFS layer is calling this function with create==1.  If you
 + * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
 + * don't use this function.
  */
  static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
                                       struct buffer_head *bh_result,
        return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
  }
  
 -/* This is special helper for reiserfs_get_block in case we are executing
 -   direct_IO request. */
 +/*
 + * This is special helper for reiserfs_get_block in case we are executing
 + * direct_IO request.
 + */
  static int reiserfs_get_blocks_direct_io(struct inode *inode,
                                         sector_t iblock,
                                         struct buffer_head *bh_result,
  
        bh_result->b_page = NULL;
  
 -      /* We set the b_size before reiserfs_get_block call since it is
 -         referenced in convert_tail_for_hole() that may be called from
 -         reiserfs_get_block() */
 +      /*
 +       * We set the b_size before reiserfs_get_block call since it is
 +       * referenced in convert_tail_for_hole() that may be called from
 +       * reiserfs_get_block()
 +       */
        bh_result->b_size = (1 << inode->i_blkbits);
  
        ret = reiserfs_get_block(inode, iblock, bh_result,
  
        /* don't allow direct io onto tail pages */
        if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
 -              /* make sure future calls to the direct io funcs for this offset
 -               ** in the file fail by unmapping the buffer
 +              /*
 +               * make sure future calls to the direct io funcs for this
 +               * offset in the file fail by unmapping the buffer
                 */
                clear_buffer_mapped(bh_result);
                ret = -EINVAL;
        }
 -      /* Possible unpacked tail. Flush the data before pages have
 -         disappeared */
 +
 +      /*
 +       * Possible unpacked tail. Flush the data before pages have
 +       * disappeared
 +       */
        if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
                int err;
  
                if (err < 0)
                        ret = err;
        }
 -      out:
 +out:
        return ret;
  }
  
  /*
 -** helper function for when reiserfs_get_block is called for a hole
 -** but the file tail is still in a direct item
 -** bh_result is the buffer head for the hole
 -** tail_offset is the offset of the start of the tail in the file
 -**
 -** This calls prepare_write, which will start a new transaction
 -** you should not be in a transaction, or have any paths held when you
 -** call this.
 -*/
 + * helper function for when reiserfs_get_block is called for a hole
 + * but the file tail is still in a direct item
 + * bh_result is the buffer head for the hole
 + * tail_offset is the offset of the start of the tail in the file
 + *
 + * This calls prepare_write, which will start a new transaction
 + * you should not be in a transaction, or have any paths held when you
 + * call this.
 + */
  static int convert_tail_for_hole(struct inode *inode,
                                 struct buffer_head *bh_result,
                                 loff_t tail_offset)
        tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
  
        index = tail_offset >> PAGE_CACHE_SHIFT;
 -      /* hole_page can be zero in case of direct_io, we are sure
 -         that we cannot get here if we write with O_DIRECT into
 -         tail page */
 +      /*
 +       * hole_page can be zero in case of direct_io, we are sure
 +       * that we cannot get here if we write with O_DIRECT into tail page
 +       */
        if (!hole_page || index != hole_page->index) {
                tail_page = grab_cache_page(inode->i_mapping, index);
                retval = -ENOMEM;
                tail_page = hole_page;
        }
  
 -      /* we don't have to make sure the conversion did not happen while
 -       ** we were locking the page because anyone that could convert
 -       ** must first take i_mutex.
 -       **
 -       ** We must fix the tail page for writing because it might have buffers
 -       ** that are mapped, but have a block number of 0.  This indicates tail
 -       ** data that has been read directly into the page, and
 -       ** __block_write_begin won't trigger a get_block in this case.
 +      /*
 +       * we don't have to make sure the conversion did not happen while
 +       * we were locking the page because anyone that could convert
 +       * must first take i_mutex.
 +       *
 +       * We must fix the tail page for writing because it might have buffers
 +       * that are mapped, but have a block number of 0.  This indicates tail
 +       * data that has been read directly into the page, and
 +       * __block_write_begin won't trigger a get_block in this case.
         */
        fix_tail_page_for_writing(tail_page);
        retval = __reiserfs_write_begin(tail_page, tail_start,
  
        retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
  
 -      unlock:
 +unlock:
        if (tail_page != hole_page) {
                unlock_page(tail_page);
                page_cache_release(tail_page);
        }
 -      out:
 +out:
        return retval;
  }
  
@@@ -657,8 -604,7 +657,8 @@@ int reiserfs_get_block(struct inode *in
                       struct buffer_head *bh_result, int create)
  {
        int repeat, retval = 0;
 -      b_blocknr_t allocated_block_nr = 0;     // b_blocknr_t is (unsigned) 32 bit int
 +      /* b_blocknr_t is an (unsigned) 32-bit int */
 +      b_blocknr_t allocated_block_nr = 0;
        INITIALIZE_PATH(path);
        int pos_in_item;
        struct cpu_key key;
        int done;
        int fs_gen;
        struct reiserfs_transaction_handle *th = NULL;
 -      /* space reserved in transaction batch:
 -         . 3 balancings in direct->indirect conversion
 -         . 1 block involved into reiserfs_update_sd()
 -         XXX in practically impossible worst case direct2indirect()
 -         can incur (much) more than 3 balancings.
 -         quota update for user, group */
 +      /*
 +       * space reserved in transaction batch:
 +       * . 3 balancings in direct->indirect conversion
 +       * . 1 block involved into reiserfs_update_sd()
 +       * XXX in practically impossible worst case direct2indirect()
 +       * can incur (much) more than 3 balancings.
 +       * quota update for user, group
 +       */
        int jbegin_count =
            JOURNAL_PER_BALANCE_CNT * 3 + 1 +
            2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
                return -EFBIG;
        }
  
 -      /* if !create, we aren't changing the FS, so we don't need to
 -       ** log anything, so we don't need to start a transaction
 +      /*
 +       * if !create, we aren't changing the FS, so we don't need to
 +       * log anything, so we don't need to start a transaction
         */
        if (!(create & GET_BLOCK_CREATE)) {
                int ret;
                reiserfs_write_unlock(inode->i_sb);
                return ret;
        }
 +
        /*
         * if we're already in a transaction, make sure to close
         * any new transactions we start in this func
            reiserfs_transaction_running(inode->i_sb))
                dangle = 0;
  
 -      /* If file is of such a size, that it might have a tail and tails are enabled
 -       ** we should mark it as possibly needing tail packing on close
 +      /*
 +       * If the file is of such a size that it might have a tail and
 +       * tails are enabled, we should mark it as possibly needing
 +       * tail packing on close
         */
        if ((have_large_tails(inode->i_sb)
             && inode->i_size < i_block_size(inode) * 4)
        /* set the key of the first byte in the 'block'-th block of file */
        make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
        if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
 -            start_trans:
 +start_trans:
                th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
                if (!th) {
                        retval = -ENOMEM;
                }
                reiserfs_update_inode_transaction(inode);
        }
 -      research:
 +research:
  
        retval = search_for_position_by_key(inode->i_sb, &key, &path);
        if (retval == IO_ERROR) {
        }
  
        bh = get_last_bh(&path);
 -      ih = get_ih(&path);
 -      item = get_item(&path);
 +      ih = tp_item_head(&path);
 +      item = tp_item_body(&path);
        pos_in_item = path.pos_in_item;
  
        fs_gen = get_generation(inode->i_sb);
                    _allocate_block(th, block, inode, &allocated_block_nr,
                                    &path, create);
  
 +              /*
 +               * restart the transaction to give the journal a chance to free
 +               * some blocks.  releases the path, so we have to go back to
 +               * research if we succeed on the second try
 +               */
                if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
 -                      /* restart the transaction to give the journal a chance to free
 -                       ** some blocks.  releases the path, so we have to go back to
 -                       ** research if we succeed on the second try
 -                       */
                        SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
                        retval = restart_transaction(th, inode, &path);
                        if (retval)
  
        if (indirect_item_found(retval, ih)) {
                b_blocknr_t unfm_ptr;
 -              /* 'block'-th block is in the file already (there is
 -                 corresponding cell in some indirect item). But it may be
 -                 zero unformatted node pointer (hole) */
 +              /*
 +               * 'block'-th block is in the file already (there is
 +               * corresponding cell in some indirect item). But it may be
 +               * zero unformatted node pointer (hole)
 +               */
                unfm_ptr = get_block_num(item, pos_in_item);
                if (unfm_ptr == 0) {
                        /* use allocated block to plug the hole */
                                reiserfs_add_ordered_list(inode, bh_result);
                        put_block_num(item, pos_in_item, allocated_block_nr);
                        unfm_ptr = allocated_block_nr;
 -                      journal_mark_dirty(th, inode->i_sb, bh);
 +                      journal_mark_dirty(th, bh);
                        reiserfs_update_sd(th, inode);
                }
                set_block_dev_mapped(bh_result, unfm_ptr, inode);
  
                reiserfs_write_unlock(inode->i_sb);
  
 -              /* the item was found, so new blocks were not added to the file
 -               ** there is no need to make sure the inode is updated with this
 -               ** transaction
 +              /*
 +               * the item was found, so new blocks were not added to the file
 +               * there is no need to make sure the inode is updated with this
 +               * transaction
                 */
                return retval;
        }
                goto start_trans;
        }
  
 -      /* desired position is not found or is in the direct item. We have
 -         to append file with holes up to 'block'-th block converting
 -         direct items to indirect one if necessary */
 +      /*
 +       * desired position is not found or is in the direct item. We have
 +       * to append file with holes up to 'block'-th block converting
 +       * direct items to indirect one if necessary
 +       */
        done = 0;
        do {
                if (is_statdata_le_ih(ih)) {
                                          TYPE_INDIRECT, UNFM_P_SIZE,
                                          0 /* free_space */ );
  
 +                      /*
 +                       * we are going to add 'block'-th block to the file.
 +                       * Use allocated block for that
 +                       */
                        if (cpu_key_k_offset(&key) == 1) {
 -                              /* we are going to add 'block'-th block to the file. Use
 -                                 allocated block for that */
                                unp = cpu_to_le32(allocated_block_nr);
                                set_block_dev_mapped(bh_result,
                                                     allocated_block_nr, inode);
                                set_buffer_new(bh_result);
                                done = 1;
                        }
 -                      tmp_key = key;  // ;)
 +                      tmp_key = key;  /* ;) */
                        set_cpu_key_k_offset(&tmp_key, 1);
                        PATH_LAST_POSITION(&path)++;
  
                        if (retval) {
                                reiserfs_free_block(th, inode,
                                                    allocated_block_nr, 1);
 -                              goto failure;   // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
 +                              /*
 +                               * retval == -ENOSPC, -EDQUOT or -EIO
 +                               * or -EEXIST
 +                               */
 +                              goto failure;
                        }
 -                      //mark_tail_converted (inode);
                } else if (is_direct_le_ih(ih)) {
                        /* direct item has to be converted */
                        loff_t tail_offset;
                        tail_offset =
                            ((le_ih_k_offset(ih) -
                              1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
 +
 +                      /*
 +                       * direct item we just found fits into block we have
 +                       * to map. Convert it into unformatted node: use
 +                       * bh_result for the conversion
 +                       */
                        if (tail_offset == cpu_key_k_offset(&key)) {
 -                              /* direct item we just found fits into block we have
 -                                 to map. Convert it into unformatted node: use
 -                                 bh_result for the conversion */
                                set_block_dev_mapped(bh_result,
                                                     allocated_block_nr, inode);
                                unbh = bh_result;
                                done = 1;
                        } else {
 -                              /* we have to padd file tail stored in direct item(s)
 -                                 up to block size and convert it to unformatted
 -                                 node. FIXME: this should also get into page cache */
 +                              /*
 +                               * we have to pad file tail stored in direct
 +                               * item(s) up to block size and convert it
 +                               * to unformatted node. FIXME: this should
 +                               * also get into page cache
 +                               */
  
                                pathrelse(&path);
                                /*
                                                        inode->i_ino,
                                                        retval);
                                        if (allocated_block_nr) {
 -                                              /* the bitmap, the super, and the stat data == 3 */
 +                                              /*
 +                                               * the bitmap, the super,
 +                                               * and the stat data == 3
 +                                               */
                                                if (!th)
                                                        th = reiserfs_persistent_transaction(inode->i_sb, 3);
                                                if (th)
                                                    allocated_block_nr, 1);
                                goto failure;
                        }
 -                      /* it is important the set_buffer_uptodate is done after
 -                       ** the direct2indirect.  The buffer might contain valid
 -                       ** data newer than the data on disk (read by readpage, changed,
 -                       ** and then sent here by writepage).  direct2indirect needs
 -                       ** to know if unbh was already up to date, so it can decide
 -                       ** if the data in unbh needs to be replaced with data from
 -                       ** the disk
 +                      /*
 +                       * it is important the set_buffer_uptodate is done
 +                       * after the direct2indirect.  The buffer might
 +                       * contain valid data newer than the data on disk
 +                       * (read by readpage, changed, and then sent here by
 +                       * writepage).  direct2indirect needs to know if unbh
 +                       * was already up to date, so it can decide if the
 +                       * data in unbh needs to be replaced with data from
 +                       * the disk
                         */
                        set_buffer_uptodate(unbh);
  
 -                      /* unbh->b_page == NULL in case of DIRECT_IO request, this means
 -                         buffer will disappear shortly, so it should not be added to
 +                      /*
 +                       * unbh->b_page == NULL in case of DIRECT_IO request,
 +                       * this means buffer will disappear shortly, so it
 +                       * should not be added to the tail list
                         */
                        if (unbh->b_page) {
 -                              /* we've converted the tail, so we must
 -                               ** flush unbh before the transaction commits
 +                              /*
 +                               * we've converted the tail, so we must
 +                               * flush unbh before the transaction commits
                                 */
                                reiserfs_add_tail_list(inode, unbh);
  
 -                              /* mark it dirty now to prevent commit_write from adding
 -                               ** this buffer to the inode's dirty buffer list
 +                              /*
 +                               * mark it dirty now to prevent commit_write
 +                               * from adding this buffer to the inode's
 +                               * dirty buffer list
                                 */
                                /*
 -                               * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
 -                               * It's still atomic, but it sets the page dirty too,
 -                               * which makes it eligible for writeback at any time by the
 -                               * VM (which was also the case with __mark_buffer_dirty())
 +                               * AKPM: changed __mark_buffer_dirty to
 +                               * mark_buffer_dirty().  It's still atomic,
 +                               * but it sets the page dirty too, which makes
 +                               * it eligible for writeback at any time by the
 +                               * VM (which was also the case with
 +                               * __mark_buffer_dirty())
                                 */
                                mark_buffer_dirty(unbh);
                        }
                } else {
 -                      /* append indirect item with holes if needed, when appending
 -                         pointer to 'block'-th block use block, which is already
 -                         allocated */
 +                      /*
 +                       * append the indirect item with holes if needed; when
 +                       * appending the pointer to the 'block'-th block, use
 +                       * the block that is already allocated
 +                       */
                        struct cpu_key tmp_key;
 -                      unp_t unf_single = 0;   // We use this in case we need to allocate only
 -                      // one block which is a fastpath
 +                      /*
 +                       * We use this in case we need to allocate
 +                       * only one block, which is the fast path
 +                       */
 +                      unp_t unf_single = 0;
                        unp_t *un;
                        __u64 max_to_insert =
                            MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
  
                        RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
                               "vs-804: invalid position for append");
 -                      /* indirect item has to be appended, set up key of that position */
 +                      /*
 +                       * indirect item has to be appended,
 +                       * set up key of that position
 +                       * (key type is unimportant)
 +                       */
                        make_cpu_key(&tmp_key, inode,
                                     le_key_k_offset(version,
 -                                                   &(ih->ih_key)) +
 +                                                   &ih->ih_key) +
                                     op_bytes_number(ih,
                                                     inode->i_sb->s_blocksize),
 -                                   //pos_in_item * inode->i_sb->s_blocksize,
 -                                   TYPE_INDIRECT, 3); // key type is unimportant
 +                                   TYPE_INDIRECT, 3);
  
                        RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
                               "green-805: invalid offset");
                                }
                        }
                        if (blocks_needed <= max_to_insert) {
 -                              /* we are going to add target block to the file. Use allocated
 -                                 block for that */
 +                              /*
 +                               * we are going to add target block to
 +                               * the file. Use allocated block for that
 +                               */
                                un[blocks_needed - 1] =
                                    cpu_to_le32(allocated_block_nr);
                                set_block_dev_mapped(bh_result,
                                done = 1;
                        } else {
                                /* paste hole to the indirect item */
 -                              /* If kmalloc failed, max_to_insert becomes zero and it means we
 -                                 only have space for one block */
 +                              /*
 +                               * If kmalloc failed, max_to_insert becomes
 +                               * zero and it means we only have space for
 +                               * one block
 +                               */
                                blocks_needed =
                                    max_to_insert ? max_to_insert : 1;
                        }
                                goto failure;
                        }
                        if (!done) {
 -                              /* We need to mark new file size in case this function will be
 -                                 interrupted/aborted later on. And we may do this only for
 -                                 holes. */
 +                              /*
 +                               * We need to mark the new file size in case
 +                               * this function is interrupted/aborted
 +                               * later on, and we may do this only for
 +                               * holes.
 +                               */
                                inode->i_size +=
                                    inode->i_sb->s_blocksize * blocks_needed;
                        }
                if (done == 1)
                        break;
  
 -              /* this loop could log more blocks than we had originally asked
 -               ** for.  So, we have to allow the transaction to end if it is
 -               ** too big or too full.  Update the inode so things are
 -               ** consistent if we crash before the function returns
 -               **
 -               ** release the path so that anybody waiting on the path before
 -               ** ending their transaction will be able to continue.
 +              /*
 +               * this loop could log more blocks than we had originally
 +               * asked for.  So, we have to allow the transaction to end
 +               * if it is too big or too full.  Update the inode so things
 +               * are consistent if we crash before the function returns.
 +               *
 +               * release the path so that anybody waiting on the path before
 +               * ending their transaction will be able to continue.
                 */
                if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
                        retval = restart_transaction(th, inode, &path);
                        goto failure;
                }
                bh = get_last_bh(&path);
 -              ih = get_ih(&path);
 -              item = get_item(&path);
 +              ih = tp_item_head(&path);
 +              item = tp_item_body(&path);
                pos_in_item = path.pos_in_item;
        } while (1);
  
        retval = 0;
  
 -      failure:
 +failure:
        if (th && (!dangle || (retval && !th->t_trans_id))) {
                int err;
                if (th->t_trans_id)
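As an aside on the tail_offset computation in the hunk above: reiserfs item
offsets are 1-based, so the code shifts to 0-based, masks down to the block
boundary, and shifts back. A minimal userspace sketch of that arithmetic (the
block size and sample offsets are assumptions, not values from this diff):

#include <assert.h>
#include <stdio.h>

/*
 * Mirror of the tail_offset arithmetic in reiserfs_get_block():
 * offsets are 1-based, so shift to 0-based, round down to the
 * block boundary, then shift back to 1-based.
 */
static unsigned long tail_block_start(unsigned long offset_1based,
				      unsigned long blocksize)
{
	return ((offset_1based - 1) & ~(blocksize - 1)) + 1;
}

int main(void)
{
	unsigned long bs = 4096;	/* assumed block size */

	/* byte 1 and byte 4096 both live in the first block */
	assert(tail_block_start(1, bs) == 1);
	assert(tail_block_start(4096, bs) == 1);
	/* byte 4097 starts the second block */
	assert(tail_block_start(4097, bs) == 4097);

	printf("tail of offset 6000 starts at %lu\n",
	       tail_block_start(6000, bs));	/* prints 4097 */
	return 0;
}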
@@@ -1165,10 -1060,8 +1165,10 @@@ reiserfs_readpages(struct file *file, s
        return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
  }
  
 -/* Compute real number of used bytes by file
 - * Following three functions can go away when we'll have enough space in stat item
 +/*
 + * Compute the real number of bytes used by the file.
 + * The following three functions can go away when we have enough space
 + * in the stat item
   */
  static int real_space_diff(struct inode *inode, int sd_size)
  {
        if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
                return sd_size;
  
 -      /* End of file is also in full block with indirect reference, so round
 -       ** up to the next block.
 -       **
 -       ** there is just no way to know if the tail is actually packed
 -       ** on the file, so we have to assume it isn't.  When we pack the
 -       ** tail, we add 4 bytes to pretend there really is an unformatted
 -       ** node pointer
 +      /*
 +       * End of file is also in full block with indirect reference, so round
 +       * up to the next block.
 +       *
 +       * there is just no way to know if the tail is actually packed
 +       * on the file, so we have to assume it isn't.  When we pack the
 +       * tail, we add 4 bytes to pretend there really is an unformatted
 +       * node pointer
         */
        bytes =
            ((inode->i_size +
@@@ -1216,36 -1108,36 +1216,36 @@@ static inline ulong to_fake_used_blocks
                bytes += (loff_t) 511;
        }
  
 -      /* files from before the quota patch might i_blocks such that
 -       ** bytes < real_space.  Deal with that here to prevent it from
 -       ** going negative.
 +      /*
 +       * files from before the quota patch might have i_blocks such that
 +       * bytes < real_space.  Deal with that here to prevent it from
 +       * going negative.
         */
        if (bytes < real_space)
                return 0;
        return (bytes - real_space) >> 9;
  }
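to_fake_used_blocks() above turns a byte count into 512-byte sectors while
clamping the pre-quota-patch case where bytes < real_space. A hedged
userspace sketch of just that guard (the values are illustrative):

#include <stdio.h>

typedef long long loff_t_sim;	/* stand-in for the kernel's loff_t */

/* bytes -> 512-byte sectors, never going negative */
static unsigned long fake_used_blocks(loff_t_sim bytes, loff_t_sim real_space)
{
	if (bytes < real_space)
		return 0;	/* old, pre-quota-patch inode */
	return (unsigned long)((bytes - real_space) >> 9);
}

int main(void)
{
	printf("%lu\n", fake_used_blocks(8192, 512));	/* 15 sectors */
	printf("%lu\n", fake_used_blocks(100, 512));	/* clamped to 0 */
	return 0;
}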
  
 -//
 -// BAD: new directories have stat data of new type and all other items
 -// of old type. Version stored in the inode says about body items, so
 -// in update_stat_data we can not rely on inode, but have to check
 -// item version directly
 -//
 +/*
 + * BAD: new directories have stat data of new type and all other items
 + * of old type. The version stored in the inode describes the body items,
 + * so in update_stat_data we cannot rely on the inode, but have to check
 + * the item version directly
 + */
  
 -// called by read_locked_inode
 +/* called by read_locked_inode */
  static void init_inode(struct inode *inode, struct treepath *path)
  {
        struct buffer_head *bh;
        struct item_head *ih;
        __u32 rdev;
 -      //int version = ITEM_VERSION_1;
  
        bh = PATH_PLAST_BUFFER(path);
 -      ih = PATH_PITEM_HEAD(path);
 +      ih = tp_item_head(path);
  
 -      copy_key(INODE_PKEY(inode), &(ih->ih_key));
 +      copy_key(INODE_PKEY(inode), &ih->ih_key);
  
 -      INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
 +      INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
        REISERFS_I(inode)->i_flags = 0;
        REISERFS_I(inode)->i_prealloc_block = 0;
        REISERFS_I(inode)->i_prealloc_count = 0;
  
        if (stat_data_v1(ih)) {
                struct stat_data_v1 *sd =
 -                  (struct stat_data_v1 *)B_I_PITEM(bh, ih);
 +                  (struct stat_data_v1 *)ih_item_body(bh, ih);
                unsigned long blocks;
  
                set_inode_item_key_version(inode, KEY_FORMAT_3_5);
                inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
                blocks = (inode->i_size + 511) >> 9;
                blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
 +
 +              /*
 +               * there was a bug in <=3.5.23 when i_blocks could take
 +               * negative values. Starting from 3.5.17 this value could
 +               * even be stored in stat data. For such files we set
 +               * i_blocks based on file size. Two notes: this can be
 +               * wrong for sparse files, and the on-disk value will only
 +               * be updated if the file's inode ever changes
 +               */
                if (inode->i_blocks > blocks) {
 -                      // there was a bug in <=3.5.23 when i_blocks could take negative
 -                      // values. Starting from 3.5.17 this value could even be stored in
 -                      // stat data. For such files we set i_blocks based on file
 -                      // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
 -                      // only updated if file's inode will ever change
                        inode->i_blocks = blocks;
                }
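The clamp above recomputes a plausible i_blocks from i_size: round the size
up to 512-byte sectors, then round the sector count up to a whole filesystem
block. A small sketch of that round-up, with the kernel's _ROUND_UP replaced
by an assumed power-of-two helper:

#include <stdio.h>

/* assumed equivalent of _ROUND_UP for a power-of-two 'unit' */
static unsigned long round_up_pow2(unsigned long x, unsigned long unit)
{
	return (x + unit - 1) & ~(unit - 1);
}

static unsigned long blocks_from_size(unsigned long long i_size,
				      unsigned long blocksize)
{
	unsigned long blocks = (unsigned long)((i_size + 511) >> 9);

	/* round sector count up to a whole fs block worth of sectors */
	return round_up_pow2(blocks, blocksize >> 9);
}

int main(void)
{
	/* 5000 bytes -> 10 sectors -> rounded to 16 (two 4k blocks) */
	printf("%lu\n", blocks_from_size(5000, 4096));
	return 0;
}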
  
                rdev = sd_v1_rdev(sd);
                REISERFS_I(inode)->i_first_direct_byte =
                    sd_v1_first_direct_byte(sd);
 -              /* an early bug in the quota code can give us an odd number for the
 -               ** block count.  This is incorrect, fix it here.
 +
 +              /*
 +               * an early bug in the quota code can give us an odd
 +               * number for the block count.  This is incorrect, fix it here.
                 */
                if (inode->i_blocks & 1) {
                        inode->i_blocks++;
                inode_set_bytes(inode,
                                to_real_used_space(inode, inode->i_blocks,
                                                   SD_V1_SIZE));
 -              /* nopack is initially zero for v1 objects. For v2 objects,
 -                 nopack is initialised from sd_attrs */
 +              /*
 +               * nopack is initially zero for v1 objects. For v2 objects,
 +               * nopack is initialised from sd_attrs
 +               */
                REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
        } else {
 -              // new stat data found, but object may have old items
 -              // (directories and symlinks)
 -              struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
 +              /*
 +               * new stat data found, but object may have old items
 +               * (directories and symlinks)
 +               */
 +              struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih);
  
                inode->i_mode = sd_v2_mode(sd);
                set_nlink(inode, sd_v2_nlink(sd));
                inode_set_bytes(inode,
                                to_real_used_space(inode, inode->i_blocks,
                                                   SD_V2_SIZE));
 -              /* read persistent inode attributes from sd and initialise
 -                 generic inode flags from them */
 +              /*
 +               * read persistent inode attributes from sd and initialise
 +               * generic inode flags from them
 +               */
                REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
                sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
        }
        }
  }
  
 -// update new stat data with inode fields
 +/* update new stat data with inode fields */
  static void inode2sd(void *sd, struct inode *inode, loff_t size)
  {
        struct stat_data *sd_v2 = (struct stat_data *)sd;
        set_sd_v2_attrs(sd_v2, flags);
  }
  
 -// used to copy inode's fields to old stat data
 +/* used to copy inode's fields to old stat data */
  static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
  {
        struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
        else
                set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
  
 -      // Sigh. i_first_direct_byte is back
 +      /* Sigh. i_first_direct_byte is back */
        set_sd_v1_first_direct_byte(sd_v1,
                                    REISERFS_I(inode)->i_first_direct_byte);
  }
  
 -/* NOTE, you must prepare the buffer head before sending it here,
 -** and then log it after the call
 -*/
 +/*
 + * NOTE, you must prepare the buffer head before sending it here,
 + * and then log it after the call
 + */
  static void update_stat_data(struct treepath *path, struct inode *inode,
                             loff_t size)
  {
        struct item_head *ih;
  
        bh = PATH_PLAST_BUFFER(path);
 -      ih = PATH_PITEM_HEAD(path);
 +      ih = tp_item_head(path);
  
        if (!is_statdata_le_ih(ih))
                reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
                               INODE_PKEY(inode), ih);
  
 +      /* path points to old stat data */
        if (stat_data_v1(ih)) {
 -              // path points to old stat data
 -              inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
 +              inode2sd_v1(ih_item_body(bh, ih), inode, size);
        } else {
 -              inode2sd(B_I_PITEM(bh, ih), inode, size);
 +              inode2sd(ih_item_body(bh, ih), inode, size);
        }
  
        return;
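update_stat_data() above illustrates the point of the earlier "BAD" comment:
the serializer is chosen by the item's own version, never by the inode. A
hedged sketch of that dispatch with invented, simplified v1/v2 layouts (none
of these fields are the real on-disk formats):

#include <stdint.h>
#include <stdio.h>

/* invented, simplified stand-ins for the two stat-data layouts */
struct sd_v1 { uint16_t mode; uint32_t size; };
struct sd_v2 { uint16_t mode; uint64_t size; };

struct inode_sim { uint16_t mode; uint64_t size; };
struct item_head_sim { int version; };	/* 1 == old (v1) stat data */

static void inode2sd_v1_sim(void *sd, const struct inode_sim *inode)
{
	struct sd_v1 *v1 = sd;

	v1->mode = inode->mode;
	v1->size = (uint32_t)inode->size;	/* v1 only holds 32 bits */
}

static void inode2sd_sim(void *sd, const struct inode_sim *inode)
{
	struct sd_v2 *v2 = sd;

	v2->mode = inode->mode;
	v2->size = inode->size;
}

/* dispatch on the item's own version, mirroring update_stat_data() */
static void update_sd_sim(const struct item_head_sim *ih, void *item,
			  const struct inode_sim *inode)
{
	if (ih->version == 1)
		inode2sd_v1_sim(item, inode);
	else
		inode2sd_sim(item, inode);
}

int main(void)
{
	struct inode_sim ino = { 0644, 123456 };
	struct item_head_sim ih = { 2 };
	unsigned char item[sizeof(struct sd_v2)];

	update_sd_sim(&ih, item, &ino);
	printf("wrote v%d stat data\n", ih.version);
	return 0;
}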
@@@ -1456,8 -1335,7 +1456,8 @@@ void reiserfs_update_sd_size(struct rei
  
        BUG_ON(!th->t_trans_id);
  
 -      make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);        //key type is unimportant
 +      /* key type is unimportant */
 +      make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);
  
        for (;;) {
                int pos;
                        return;
                }
  
 -              /* sigh, prepare_for_journal might schedule.  When it schedules the
 -               ** FS might change.  We have to detect that, and loop back to the
 -               ** search if the stat data item has moved
 +              /*
 +               * sigh, prepare_for_journal might schedule.  When it
 +               * schedules the FS might change.  We have to detect that,
 +               * and loop back to the search if the stat data item has moved
                 */
                bh = get_last_bh(&path);
 -              ih = get_ih(&path);
 +              ih = tp_item_head(&path);
                copy_item_head(&tmp_ih, ih);
                fs_gen = get_generation(inode->i_sb);
                reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
 +
 +              /* Stat_data item has been moved after scheduling. */
                if (fs_changed(fs_gen, inode->i_sb)
                    && item_moved(&tmp_ih, &path)) {
                        reiserfs_restore_prepared_buffer(inode->i_sb, bh);
 -                      continue;       /* Stat_data item has been moved after scheduling. */
 +                      continue;
                }
                break;
        }
        update_stat_data(&path, inode, size);
 -      journal_mark_dirty(th, th->t_super, bh);
 +      journal_mark_dirty(th, bh);
        pathrelse(&path);
        return;
  }
  
 -/* reiserfs_read_locked_inode is called to read the inode off disk, and it
 -** does a make_bad_inode when things go wrong.  But, we need to make sure
 -** and clear the key in the private portion of the inode, otherwise a
 -** corresponding iput might try to delete whatever object the inode last
 -** represented.
 -*/
 +/*
 + * reiserfs_read_locked_inode is called to read the inode off disk, and it
 + * does a make_bad_inode when things go wrong.  But, we need to make sure
 + * and clear the key in the private portion of the inode, otherwise a
 + * corresponding iput might try to delete whatever object the inode last
 + * represented.
 + */
  static void reiserfs_make_bad_inode(struct inode *inode)
  {
        memset(INODE_PKEY(inode), 0, KEY_SIZE);
        make_bad_inode(inode);
  }
  
 -//
 -// initially this function was derived from minix or ext2's analog and
 -// evolved as the prototype did
 -//
 -
 +/*
 + * initially this function was derived from minix or ext2's analog and
 + * evolved as the prototype did
 + */
  int reiserfs_init_locked_inode(struct inode *inode, void *p)
  {
        struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
        return 0;
  }
  
 -/* looks for stat data in the tree, and fills up the fields of in-core
 -   inode stat data fields */
 +/*
 + * looks for stat data in the tree, and fills in the in-core inode's
 + * fields from it
 + */
  void reiserfs_read_locked_inode(struct inode *inode,
                                struct reiserfs_iget_args *args)
  {
  
        dirino = args->dirid;
  
 -      /* set version 1, version 2 could be used too, because stat data
 -         key is the same in both versions */
 +      /*
 +       * set version 1, version 2 could be used too, because stat data
 +       * key is the same in both versions
 +       */
        key.version = KEY_FORMAT_3_5;
        key.on_disk_key.k_dir_id = dirino;
        key.on_disk_key.k_objectid = inode->i_ino;
                reiserfs_make_bad_inode(inode);
                return;
        }
 +
 +      /* a stale NFS handle can trigger this without it being an error */
        if (retval != ITEM_FOUND) {
 -              /* a stale NFS handle can trigger this without it being an error */
                pathrelse(&path_to_sd);
                reiserfs_make_bad_inode(inode);
                clear_nlink(inode);
  
        init_inode(inode, &path_to_sd);
  
 -      /* It is possible that knfsd is trying to access inode of a file
 -         that is being removed from the disk by some other thread. As we
 -         update sd on unlink all that is required is to check for nlink
 -         here. This bug was first found by Sizif when debugging
 -         SquidNG/Butterfly, forgotten, and found again after Philippe
 -         Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
 -
 -         More logical fix would require changes in fs/inode.c:iput() to
 -         remove inode from hash-table _after_ fs cleaned disk stuff up and
 -         in iget() to return NULL if I_FREEING inode is found in
 -         hash-table. */
 -      /* Currently there is one place where it's ok to meet inode with
 -         nlink==0: processing of open-unlinked and half-truncated files
 -         during mount (fs/reiserfs/super.c:finish_unfinished()). */
 +      /*
 +       * It is possible that knfsd is trying to access inode of a file
 +       * that is being removed from the disk by some other thread. As we
 +       * update sd on unlink all that is required is to check for nlink
 +       * here. This bug was first found by Sizif when debugging
 +       * SquidNG/Butterfly, forgotten, and found again after Philippe
 +       * Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
 +       *
 +       * A more logical fix would require changes in fs/inode.c:iput() to
 +       * remove inode from hash-table _after_ fs cleaned disk stuff up and
 +       * in iget() to return NULL if I_FREEING inode is found in
 +       * hash-table.
 +       */
 +
 +      /*
 +       * Currently there is one place where it's ok to meet inode with
 +       * nlink==0: processing of open-unlinked and half-truncated files
 +       * during mount (fs/reiserfs/super.c:finish_unfinished()).
 +       */
        if ((inode->i_nlink == 0) &&
            !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
                reiserfs_warning(inode->i_sb, "vs-13075",
                reiserfs_make_bad_inode(inode);
        }
  
 -      reiserfs_check_path(&path_to_sd);       /* init inode should be relsing */
 +      /* init_inode should be releasing the path */
 +      reiserfs_check_path(&path_to_sd);
  
        /*
         * Stat data v1 doesn't support ACLs.
                cache_no_acl(inode);
  }
  
 -/**
 +/*
   * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
   *
   * @inode:    inode from hash table to check
@@@ -1692,8 -1556,7 +1692,8 @@@ static struct dentry *reiserfs_get_dent
  struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type)
  {
 -      /* fhtype happens to reflect the number of u32s encoded.
 +      /*
 +       * fhtype happens to reflect the number of u32s encoded.
         * due to a bug in earlier code, fhtype might indicate there
         * are more u32s than actually fit.
         * so if fhtype seems to be more than len, reduce fhtype.
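The comment above describes defending against an old encoder bug where fhtype
claimed more u32s than the handle really holds; the defense is a simple
clamp. A sketch under that assumption (the names are invented):

#include <stdio.h>

/* if fh_type claims more u32s than fh_len provides, trust fh_len */
static int sanitize_fh_type(int fh_type, int fh_len)
{
	return fh_type > fh_len ? fh_len : fh_type;
}

int main(void)
{
	printf("%d\n", sanitize_fh_type(6, 5));	/* clamped to 5 */
	printf("%d\n", sanitize_fh_type(3, 5));	/* unchanged: 3 */
	return 0;
}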
@@@ -1762,16 -1625,13 +1762,16 @@@ int reiserfs_encode_fh(struct inode *in
        return *lenp;
  }
  
 -/* looks for stat data, then copies fields to it, marks the buffer
 -   containing stat data as dirty */
 -/* reiserfs inodes are never really dirty, since the dirty inode call
 -** always logs them.  This call allows the VFS inode marking routines
 -** to properly mark inodes for datasync and such, but only actually
 -** does something when called for a synchronous update.
 -*/
 +/*
 + * looks for stat data, then copies fields to it, marks the buffer
 + * containing stat data as dirty
 + */
 +/*
 + * reiserfs inodes are never really dirty, since the dirty inode call
 + * always logs them.  This call allows the VFS inode marking routines
 + * to properly mark inodes for datasync and such, but only actually
 + * does something when called for a synchronous update.
 + */
  int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
  {
        struct reiserfs_transaction_handle th;
  
        if (inode->i_sb->s_flags & MS_RDONLY)
                return -EROFS;
 -      /* memory pressure can sometimes initiate write_inode calls with sync == 1,
 -       ** these cases are just when the system needs ram, not when the
 -       ** inode needs to reach disk for safety, and they can safely be
 -       ** ignored because the altered inode has already been logged.
 +      /*
 +       * memory pressure can sometimes initiate write_inode calls with
 +       * sync == 1.  These cases are just when the system needs ram, not
 +       * when the inode needs to reach disk for safety, and they can
 +       * safely be ignored because the altered inode has already been
 +       * logged.
         */
        if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
                reiserfs_write_lock(inode->i_sb);
                if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
                        reiserfs_update_sd(&th, inode);
 -                      journal_end_sync(&th, inode->i_sb, jbegin_count);
 +                      journal_end_sync(&th);
                }
                reiserfs_write_unlock(inode->i_sb);
        }
        return 0;
  }
  
 -/* stat data of new object is inserted already, this inserts the item
 -   containing "." and ".." entries */
 +/*
 + * stat data of new object is inserted already, this inserts the item
 + * containing "." and ".." entries
 + */
  static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
                                  struct inode *inode,
                                  struct item_head *ih, struct treepath *path,
                      le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
                      TYPE_DIRENTRY, 3 /*key length */ );
  
 -      /* compose item head for new item. Directories consist of items of
 -         old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
 -         is done by reiserfs_new_inode */
 +      /*
 +       * compose item head for new item. Directories consist of items of
 +       * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
 +       * is done by reiserfs_new_inode
 +       */
        if (old_format_only(sb)) {
                make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
                                  TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
        return reiserfs_insert_item(th, path, &key, ih, inode, body);
  }
  
 -/* stat data of object has been inserted, this inserts the item
 -   containing the body of symlink */
 -static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode,  /* Inode of symlink */
 +/*
 + * stat data of object has been inserted, this inserts the item
 + * containing the body of symlink
 + */
 +static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th,
 +                              struct inode *inode,
                                struct item_head *ih,
                                struct treepath *path, const char *symname,
                                int item_len)
        return reiserfs_insert_item(th, path, &key, ih, inode, symname);
  }
  
 -/* inserts the stat data into the tree, and then calls
 -   reiserfs_new_directory (to insert ".", ".." item if new object is
 -   directory) or reiserfs_new_symlink (to insert symlink body if new
 -   object is symlink) or nothing (if new object is regular file)
 -
 -   NOTE! uid and gid must already be set in the inode.  If we return
 -   non-zero due to an error, we have to drop the quota previously allocated
 -   for the fresh inode.  This can only be done outside a transaction, so
 -   if we return non-zero, we also end the transaction.  */
 +/*
 + * inserts the stat data into the tree, and then calls
 + * reiserfs_new_directory (to insert ".", ".." item if new object is
 + * directory) or reiserfs_new_symlink (to insert symlink body if new
 + * object is symlink) or nothing (if new object is regular file)
 + *
 + * NOTE! uid and gid must already be set in the inode.  If we return
 + * non-zero due to an error, we have to drop the quota previously allocated
 + * for the fresh inode.  This can only be done outside a transaction, so
 + * if we return non-zero, we also end the transaction.
 + *
 + * @th: active transaction handle
 + * @dir: parent directory for new inode
 + * @mode: mode of new inode
 + * @symname: symlink contents if inode is symlink
 + * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for
 + *         symlinks
 + * @inode: inode to be filled
 + * @security: optional security context to associate with this inode
 + */
  int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
                       struct inode *dir, umode_t mode, const char *symname,
                      /* 0 for regular, EMPTY_DIR_SIZE for dirs,
        else
                make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
                                  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
 -      memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
 +      memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE);
        args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
  
        depth = reiserfs_write_unlock_nested(inode->i_sb);
        }
  
        if (old_format_only(sb))
 -              /* not a perfect generation count, as object ids can be reused, but
 -               ** this is as good as reiserfs can do right now.
 -               ** note that the private part of inode isn't filled in yet, we have
 -               ** to use the directory.
 +              /*
 +               * not a perfect generation count, as object ids can be reused,
 +               * but this is as good as reiserfs can do right now.
 +               * note that the private part of inode isn't filled in yet,
 +               * we have to use the directory.
                 */
                inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
        else
        REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
            U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
  
 -      INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
 +      INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
        REISERFS_I(inode)->i_flags = 0;
        REISERFS_I(inode)->i_prealloc_block = 0;
        REISERFS_I(inode)->i_prealloc_count = 0;
                goto out_bad_inode;
        }
        if (old_format_only(sb)) {
 +              /* i_uid or i_gid is too big to be stored in stat data v3.5 */
                if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) {
                        pathrelse(&path_to_key);
 -                      /* i_uid or i_gid is too big to be stored in stat data v3.5 */
                        err = -EINVAL;
                        goto out_bad_inode;
                }
        } else {
                inode2sd(&sd, inode, inode->i_size);
        }
 -      // store in in-core inode the key of stat data and version all
 -      // object items will have (directory items will have old offset
 -      // format, other new objects will consist of new items)
 +      /*
 +       * store in in-core inode the key of stat data and version all
 +       * object items will have (directory items will have old offset
 +       * format, other new objects will consist of new items)
 +       */
        if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
                set_inode_item_key_version(inode, KEY_FORMAT_3_5);
        else
        if (retval) {
                err = retval;
                reiserfs_check_path(&path_to_key);
 -              journal_end(th, th->t_super, th->t_blocks_allocated);
 +              journal_end(th);
                goto out_inserted_sd;
        }
  
                if (retval) {
                        err = retval;
                        reiserfs_check_path(&path_to_key);
 -                      journal_end(th, th->t_super, th->t_blocks_allocated);
 +                      journal_end(th);
                        goto out_inserted_sd;
                }
        } else if (inode->i_sb->s_flags & MS_POSIXACL) {
                if (retval) {
                        err = retval;
                        reiserfs_check_path(&path_to_key);
 -                      retval = journal_end(th, th->t_super,
 -                                           th->t_blocks_allocated);
 +                      retval = journal_end(th);
                        if (retval)
                                err = retval;
                        goto out_inserted_sd;
  
        return 0;
  
 -/* it looks like you can easily compress these two goto targets into
 - * one.  Keeping it like this doesn't actually hurt anything, and they
 - * are place holders for what the quota code actually needs.
 - */
 -      out_bad_inode:
 +out_bad_inode:
        /* Invalidate the object, nothing was inserted yet */
        INODE_PKEY(inode)->k_objectid = 0;
  
        dquot_free_inode(inode);
        reiserfs_write_lock_nested(inode->i_sb, depth);
  
 -      out_end_trans:
 -      journal_end(th, th->t_super, th->t_blocks_allocated);
 -      /* Drop can be outside and it needs more credits so it's better to have it outside */
 +out_end_trans:
 +      journal_end(th);
 +      /*
 +       * dquot_drop() can be called outside the transaction, and it needs
 +       * more credits, so it's better to have it outside
 +       */
        depth = reiserfs_write_unlock_nested(inode->i_sb);
        dquot_drop(inode);
        reiserfs_write_lock_nested(inode->i_sb, depth);
        inode->i_flags |= S_NOQUOTA;
        make_bad_inode(inode);
  
 -      out_inserted_sd:
 +out_inserted_sd:
        clear_nlink(inode);
        th->t_trans_id = 0;     /* so the caller can't use this handle later */
        unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
  }
  
  /*
 -** finds the tail page in the page cache,
 -** reads the last block in.
 -**
 -** On success, page_result is set to a locked, pinned page, and bh_result
 -** is set to an up to date buffer for the last block in the file.  returns 0.
 -**
 -** tail conversion is not done, so bh_result might not be valid for writing
 -** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
 -** trying to write the block.
 -**
 -** on failure, nonzero is returned, page_result and bh_result are untouched.
 -*/
 + * finds the tail page in the page cache,
 + * reads the last block in.
 + *
 + * On success, page_result is set to a locked, pinned page, and bh_result
 + * is set to an up to date buffer for the last block in the file.  returns 0.
 + *
 + * tail conversion is not done, so bh_result might not be valid for
 + * writing; check buffer_mapped(bh_result) and bh_result->b_blocknr != 0
 + * before trying to write the block.
 + *
 + * on failure, nonzero is returned, page_result and bh_result are untouched.
 + */
  static int grab_tail_page(struct inode *inode,
                          struct page **page_result,
                          struct buffer_head **bh_result)
  {
  
 -      /* we want the page with the last byte in the file,
 -       ** not the page that will hold the next byte for appending
 +      /*
 +       * we want the page with the last byte in the file,
 +       * not the page that will hold the next byte for appending
         */
        unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
        unsigned long pos = 0;
        struct page *page;
        int error;
  
 -      /* we know that we are only called with inode->i_size > 0.
 -       ** we also know that a file tail can never be as big as a block
 -       ** If i_size % blocksize == 0, our file is currently block aligned
 -       ** and it won't need converting or zeroing after a truncate.
 +      /*
 +       * we know that we are only called with inode->i_size > 0.
 +       * we also know that a file tail can never be as big as a block
 +       * If i_size % blocksize == 0, our file is currently block aligned
 +       * and it won't need converting or zeroing after a truncate.
         */
        if ((offset & (blocksize - 1)) == 0) {
                return -ENOENT;
        } while (bh != head);
  
        if (!buffer_uptodate(bh)) {
 -              /* note, this should never happen, prepare_write should
 -               ** be taking care of this for us.  If the buffer isn't up to date,
 -               ** I've screwed up the code to find the buffer, or the code to
 -               ** call prepare_write
 +              /*
 +               * note, this should never happen, prepare_write should be
 +               * taking care of this for us.  If the buffer isn't up to
 +               * date, I've screwed up the code to find the buffer, or the
 +               * code to call prepare_write
                 */
                reiserfs_error(inode->i_sb, "clm-6000",
                               "error reading block %lu", bh->b_blocknr);
        *bh_result = bh;
        *page_result = page;
  
 -      out:
 +out:
        return error;
  
 -      unlock:
 +unlock:
        unlock_page(page);
        page_cache_release(page);
        return error;
  }
  
  /*
 -** vfs version of truncate file.  Must NOT be called with
 -** a transaction already started.
 -**
 -** some code taken from block_truncate_page
 -*/
 + * vfs version of truncate file.  Must NOT be called with
 + * a transaction already started.
 + *
 + * some code taken from block_truncate_page
 + */
  int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
  {
        struct reiserfs_transaction_handle th;
        if (inode->i_size > 0) {
                error = grab_tail_page(inode, &page, &bh);
                if (error) {
 -                      // -ENOENT means we truncated past the end of the file,
 -                      // and get_block_create_0 could not find a block to read in,
 -                      // which is ok.
 +                      /*
 +                       * -ENOENT means we truncated past the end of the
 +                       * file, and get_block_create_0 could not find a
 +                       * block to read in, which is ok.
 +                       */
                        if (error != -ENOENT)
                                reiserfs_error(inode->i_sb, "clm-6001",
                                               "grab_tail_page failed %d",
                }
        }
  
 -      /* so, if page != NULL, we have a buffer head for the offset at
 -       ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
 -       ** then we have an unformatted node.  Otherwise, we have a direct item,
 -       ** and no zeroing is required on disk.  We zero after the truncate,
 -       ** because the truncate might pack the item anyway
 -       ** (it will unmap bh if it packs).
 +      /*
 +       * so, if page != NULL, we have a buffer head for the offset at
 +       * the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
 +       * then we have an unformatted node.  Otherwise, we have a direct item,
 +       * and no zeroing is required on disk.  We zero after the truncate,
 +       * because the truncate might pack the item anyway
 +       * (it will unmap bh if it packs).
 +       *
 +       * it is enough to reserve space in transaction for 2 balancings:
 +       * one for "save" link adding and another for the first
 +       * cut_from_item. 1 is for update_sd
         */
 -      /* it is enough to reserve space in transaction for 2 balancings:
 -         one for "save" link adding and another for the first
 -         cut_from_item. 1 is for update_sd */
        error = journal_begin(&th, inode->i_sb,
                              JOURNAL_PER_BALANCE_CNT * 2 + 1);
        if (error)
                goto out;
        reiserfs_update_inode_transaction(inode);
        if (update_timestamps)
 -              /* we are doing real truncate: if the system crashes before the last
 -                 transaction of truncating gets committed - on reboot the file
 -                 either appears truncated properly or not truncated at all */
 +              /*
 +               * we are doing real truncate: if the system crashes
 +               * before the last transaction of truncating gets committed
 +               * - on reboot the file either appears truncated properly
 +               * or not truncated at all
 +               */
                add_save_link(&th, inode, 1);
        err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
 -      error =
 -          journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
 +      error = journal_end(&th);
        if (error)
                goto out;
  
        reiserfs_write_unlock(inode->i_sb);
  
        return 0;
 -      out:
 +out:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
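grab_tail_page() above wants the page holding the file's last byte, not the
page that would receive the next appended byte, and it bails out early when
i_size is already block aligned. A userspace sketch of those two computations
(4k pages and the sample sizes are assumptions):

#include <stdio.h>

#define PAGE_SHIFT_SIM 12	/* assumed 4k pages */

static unsigned long last_page_index(unsigned long long i_size)
{
	/* page of the last byte, not of the next byte to append */
	return (unsigned long)((i_size - 1) >> PAGE_SHIFT_SIM);
}

static int tail_needs_work(unsigned long long i_size, unsigned blocksize)
{
	/* block-aligned sizes need no converting or zeroing */
	return (i_size & (blocksize - 1)) != 0;
}

int main(void)
{
	printf("index=%lu\n", last_page_index(4096));	  /* 0, not 1 */
	printf("index=%lu\n", last_page_index(4097));	  /* 1 */
	printf("work=%d\n", tail_needs_work(8192, 4096)); /* 0: aligned */
	printf("work=%d\n", tail_needs_work(8200, 4096)); /* 1 */
	return 0;
}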
@@@ -2382,10 -2212,7 +2382,10 @@@ static int map_block_for_writepage(stru
        int copy_size;
        int trans_running = 0;
  
 -      /* catch places below that try to log something without starting a trans */
 +      /*
 +       * catch places below that try to log something without
 +       * starting a trans
 +       */
        th.t_trans_id = 0;
  
        if (!buffer_uptodate(bh_result)) {
        }
  
        kmap(bh_result->b_page);
 -      start_over:
 +start_over:
        reiserfs_write_lock(inode->i_sb);
        make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
  
 -      research:
 +research:
        retval = search_for_position_by_key(inode->i_sb, &key, &path);
        if (retval != POSITION_FOUND) {
                use_get_block = 1;
        }
  
        bh = get_last_bh(&path);
 -      ih = get_ih(&path);
 -      item = get_item(&path);
 +      ih = tp_item_head(&path);
 +      item = tp_item_body(&path);
        pos_in_item = path.pos_in_item;
  
        /* we've found an unformatted node */
                        goto research;
                }
  
 -              memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
 +              memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied,
                       copy_size);
  
 -              journal_mark_dirty(&th, inode->i_sb, bh);
 +              journal_mark_dirty(&th, bh);
                bytes_copied += copy_size;
                set_block_dev_mapped(bh_result, 0, inode);
  
        }
        retval = 0;
  
 -      out:
 +out:
        pathrelse(&path);
        if (trans_running) {
 -              int err = journal_end(&th, inode->i_sb, jbegin_count);
 +              int err = journal_end(&th);
                if (err)
                        retval = err;
                trans_running = 0;
        kunmap(bh_result->b_page);
  
        if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
 -              /* we've copied data from the page into the direct item, so the
 +              /*
 +               * we've copied data from the page into the direct item, so the
                 * buffer in the page is now clean, mark it to reflect that.
                 */
                lock_buffer(bh_result);
@@@ -2544,8 -2370,7 +2544,8 @@@ static int reiserfs_write_full_page(str
                return 0;
        }
  
 -      /* The page dirty bit is cleared before writepage is called, which
 +      /*
 +       * The page dirty bit is cleared before writepage is called, which
         * means we have to tell create_empty_buffers to make dirty buffers
         * The page really should be up to date at this point, so tossing
         * in the BH_Uptodate is just a sanity check.
        }
        head = page_buffers(page);
  
 -      /* last page in the file, zero out any contents past the
 -       ** last byte in the file
 +      /*
 +       * last page in the file, zero out any contents past the
 +       * last byte in the file
         */
        if (page->index >= end_index) {
                unsigned last_offset;
                           (!buffer_mapped(bh) || (buffer_mapped(bh)
                                                       && bh->b_blocknr ==
                                                       0))) {
 -                      /* not mapped yet, or it points to a direct item, search
 +                      /*
 +                       * not mapped yet, or it points to a direct item, search
                         * the btree for the mapping info, and log any direct
                         * items found
                         */
  
                if (checked) {
                        reiserfs_prepare_for_journal(s, bh, 1);
 -                      journal_mark_dirty(&th, s, bh);
 +                      journal_mark_dirty(&th, bh);
                        continue;
                }
 -              /* from this point on, we know the buffer is mapped to a
 +              /*
 +               * from this point on, we know the buffer is mapped to a
                 * real block and not a direct item
                 */
                if (wbc->sync_mode != WB_SYNC_NONE) {
        } while ((bh = bh->b_this_page) != head);
  
        if (checked) {
 -              error = journal_end(&th, s, bh_per_page + 1);
 +              error = journal_end(&th);
                reiserfs_write_unlock(s);
                if (error)
                        goto fail;
        } while (bh != head);
  
        error = 0;
 -      done:
 +done:
        if (nr == 0) {
                /*
                 * if this page only had a direct item, it is very possible for
        }
        return error;
  
 -      fail:
 -      /* catches various errors, we need to make sure any valid dirty blocks
 +fail:
 +      /*
 +       * catches various errors, we need to make sure any valid dirty blocks
         * get to the media.  The page is currently locked and not marked for
         * writeback
         */
                        mark_buffer_async_write(bh);
                } else {
                        /*
 -                       * clear any dirty bits that might have come from getting
 -                       * attached to a dirty page
 +                       * clear any dirty bits that might have come from
 +                       * getting attached to a dirty page
                         */
                        clear_buffer_dirty(bh);
                }
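The end_index check in the hunk above zeroes whatever sits past the file's
last byte in its final page before write-out. A minimal sketch of that
boundary math (page size assumed, helper name invented):

#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT_SIM 12			/* assumed 4k pages */
#define PAGE_SIZE_SIM  (1UL << PAGE_SHIFT_SIM)

/* zero the part of the file's last page that lies past i_size */
static void zero_past_eof(unsigned char *page, unsigned long page_index,
			  unsigned long long i_size)
{
	unsigned long end_index = (unsigned long)(i_size >> PAGE_SHIFT_SIM);
	unsigned long last_offset =
	    (unsigned long)(i_size & (PAGE_SIZE_SIM - 1));

	if (page_index >= end_index && last_offset)
		memset(page + last_offset, 0, PAGE_SIZE_SIM - last_offset);
}

int main(void)
{
	unsigned char page[PAGE_SIZE_SIM];

	memset(page, 0xff, sizeof(page));
	/* page 2 of a file with 100 live bytes on its last page */
	zero_past_eof(page, 2, 2 * PAGE_SIZE_SIM + 100);
	printf("byte 99=%02x byte 100=%02x\n", page[99], page[100]);
	return 0;
}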
@@@ -2793,18 -2614,15 +2793,18 @@@ static int reiserfs_write_begin(struct 
        ret = __block_write_begin(page, pos, len, reiserfs_get_block);
        if (ret && reiserfs_transaction_running(inode->i_sb)) {
                struct reiserfs_transaction_handle *th = current->journal_info;
 -              /* this gets a little ugly.  If reiserfs_get_block returned an
 -               * error and left a transacstion running, we've got to close it,
 -               * and we've got to free handle if it was a persistent transaction.
 +              /*
 +               * this gets a little ugly.  If reiserfs_get_block returned an
 +               * error and left a transaction running, we've got to close
 +               * it, and we've got to free handle if it was a persistent
 +               * transaction.
                 *
                 * But, if we had nested into an existing transaction, we need
                 * to just drop the ref count on the handle.
                 *
                 * If old_ref == 0, the transaction is from reiserfs_get_block,
 -               * and it was a persistent trans.  Otherwise, it was nested above.
 +               * and it was a persistent trans.  Otherwise, it was nested
 +               * above.
                 */
                if (th->t_refcount > old_ref) {
                        if (old_ref)
@@@ -2853,18 -2671,15 +2853,18 @@@ int __reiserfs_write_begin(struct page 
        ret = __block_write_begin(page, from, len, reiserfs_get_block);
        if (ret && reiserfs_transaction_running(inode->i_sb)) {
                struct reiserfs_transaction_handle *th = current->journal_info;
 -              /* this gets a little ugly.  If reiserfs_get_block returned an
 -               * error and left a transacstion running, we've got to close it,
 -               * and we've got to free handle if it was a persistent transaction.
 +              /*
 +               * this gets a little ugly.  If reiserfs_get_block returned an
 +               * error and left a transaction running, we've got to close
 +               * it, and we've got to free handle if it was a persistent
 +               * transaction.
                 *
                 * But, if we had nested into an existing transaction, we need
                 * to just drop the ref count on the handle.
                 *
                 * If old_ref == 0, the transaction is from reiserfs_get_block,
 -               * and it was a persistent trans.  Otherwise, it was nested above.
 +               * and it was a persistent trans.  Otherwise, it was nested
 +               * above.
                 */
                if (th->t_refcount > old_ref) {
                        if (old_ref)
@@@ -2919,20 -2734,17 +2919,20 @@@ static int reiserfs_write_end(struct fi
  
        reiserfs_commit_page(inode, page, start, start + copied);
  
 -      /* generic_commit_write does this for us, but does not update the
 -       ** transaction tracking stuff when the size changes.  So, we have
 -       ** to do the i_size updates here.
 +      /*
 +       * generic_commit_write does this for us, but does not update the
 +       * transaction tracking stuff when the size changes.  So, we have
 +       * to do the i_size updates here.
         */
        if (pos + copied > inode->i_size) {
                struct reiserfs_transaction_handle myth;
                reiserfs_write_lock(inode->i_sb);
                locked = true;
 -              /* If the file have grown beyond the border where it
 -                 can have a tail, unmark it as needing a tail
 -                 packing */
 +              /*
 +               * If the file has grown beyond the border where it
 +               * can have a tail, unmark it as needing tail
 +               * packing
 +               */
                if ((have_large_tails(inode->i_sb)
                     && inode->i_size > i_block_size(inode) * 4)
                    || (have_small_tails(inode->i_sb)
                inode->i_size = pos + copied;
                /*
                 * this will just nest into our transaction.  It's important
 -               * to use mark_inode_dirty so the inode gets pushed around on the
 -               * dirty lists, and so that O_SYNC works as expected
 +               * to use mark_inode_dirty so the inode gets pushed around on
 +               * the dirty lists, and so that O_SYNC works as expected
                 */
                mark_inode_dirty(inode);
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
 -              ret = journal_end(&myth, inode->i_sb, 1);
 +              ret = journal_end(&myth);
                if (ret)
                        goto journal_error;
        }
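The size check above drops the tail-packing hint once a file with large tails
enabled grows past four filesystem blocks; the small-tails branch is cut off
in this hunk, so this sketch models only the visible case (flag name
invented):

#include <stdio.h>

#define I_PACK_ON_CLOSE 0x1	/* assumed flag, for illustration only */

struct inode_sim {
	unsigned long long i_size;
	unsigned flags;
};

/* clear the tail-packing hint once the file is past 4 blocks */
static void maybe_unmark_tail_packing(struct inode_sim *ino,
				      unsigned block_size,
				      int have_large_tails)
{
	if (have_large_tails && ino->i_size > 4ULL * block_size)
		ino->flags &= ~I_PACK_ON_CLOSE;
}

int main(void)
{
	struct inode_sim ino = { 20000, I_PACK_ON_CLOSE };

	maybe_unmark_tail_packing(&ino, 4096, 1);
	printf("flags=%#x\n", ino.flags);	/* 0: 20000 > 16384 */
	return 0;
}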
                        goto out;
        }
  
 -      out:
 +out:
        if (locked)
                reiserfs_write_unlock(inode->i_sb);
        unlock_page(page);
  
        return ret == 0 ? copied : ret;
  
 -      journal_error:
 +journal_error:
        reiserfs_write_unlock(inode->i_sb);
        locked = false;
        if (th) {
@@@ -3010,18 -2822,15 +3010,18 @@@ int reiserfs_commit_write(struct file *
        }
        reiserfs_commit_page(inode, page, from, to);
  
 -      /* generic_commit_write does this for us, but does not update the
 -       ** transaction tracking stuff when the size changes.  So, we have
 -       ** to do the i_size updates here.
 +      /*
 +       * generic_commit_write does this for us, but does not update the
 +       * transaction tracking stuff when the size changes.  So, we have
 +       * to do the i_size updates here.
         */
        if (pos > inode->i_size) {
                struct reiserfs_transaction_handle myth;
 -              /* If the file have grown beyond the border where it
 -                 can have a tail, unmark it as needing a tail
 -                 packing */
 +              /*
 +               * If the file has grown beyond the border where it
 +               * can have a tail, unmark it as needing tail
 +               * packing
 +               */
                if ((have_large_tails(inode->i_sb)
                     && inode->i_size > i_block_size(inode) * 4)
                    || (have_small_tails(inode->i_sb)
                inode->i_size = pos;
                /*
                 * this will just nest into our transaction.  It's important
 -               * to use mark_inode_dirty so the inode gets pushed around on the
 -               * dirty lists, and so that O_SYNC works as expected
 +               * to use mark_inode_dirty so the inode gets pushed around
 +               * on the dirty lists, and so that O_SYNC works as expected
                 */
                mark_inode_dirty(inode);
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
 -              ret = journal_end(&myth, inode->i_sb, 1);
 +              ret = journal_end(&myth);
                if (ret)
                        goto journal_error;
        }
                        goto out;
        }
  
 -      out:
 +out:
        return ret;
  
 -      journal_error:
 +journal_error:
        if (th) {
                if (!update_sd)
                        reiserfs_update_sd(th, inode);
@@@ -3115,10 -2924,9 +3115,10 @@@ void i_attrs_to_sd_attrs(struct inode *
        }
  }
  
 -/* decide if this buffer needs to stay around for data logging or ordered
 -** write purposes
 -*/
 +/*
 + * decide if this buffer needs to stay around for data logging or ordered
 + * write purposes
 + */
  static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
  {
        int ret = 1;
        if (!buffer_mapped(bh)) {
                goto free_jh;
        }
 -      /* the page is locked, and the only places that log a data buffer
 +      /*
 +       * the page is locked, and the only places that log a data buffer
         * also lock the page.
         */
        if (reiserfs_file_data_log(inode)) {
                struct reiserfs_journal_list *jl;
                struct reiserfs_jh *jh = bh->b_private;
  
 -              /* why is this safe?
 +              /*
 +               * why is this safe?
                 * reiserfs_setattr updates i_size in the on disk
                 * stat data before allowing vmtruncate to be called.
                 *
                    && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
                        ret = 0;
        }
 -      free_jh:
 +free_jh:
        if (ret && bh->b_private) {
                reiserfs_free_jh(bh);
        }
@@@ -3222,7 -3028,7 +3222,7 @@@ static void reiserfs_invalidatepage(str
                ret = try_to_release_page(page, 0);
                /* maybe should BUG_ON(!ret); - neilb */
        }
 -      out:
 +out:
        return;
  }
  
@@@ -3274,20 -3080,18 +3274,20 @@@ static int reiserfs_releasepage(struct 
        return ret;
  }
  
 -/* We thank Mingming Cao for helping us understand in great detail what
 -   to do in this section of the code. */
 +/*
 + * We thank Mingming Cao for helping us understand in great detail what
 + * to do in this section of the code.
 + */
  static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
-                                 const struct iovec *iov, loff_t offset,
-                                 unsigned long nr_segs)
+                                 struct iov_iter *iter, loff_t offset)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
+       size_t count = iov_iter_count(iter);
        ssize_t ret;
  
-       ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
-                                 reiserfs_get_blocks_direct_io);
+       ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+                                reiserfs_get_blocks_direct_io);
  
        /*
         * In case of error extending write may have instantiated a few
         */
        if (unlikely((rw & WRITE) && ret < 0)) {
                loff_t isize = i_size_read(inode);
-               loff_t end = offset + iov_length(iov, nr_segs);
+               loff_t end = offset + count;
  
                if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
                        truncate_setsize(inode, isize);
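
The conversion above is the recurring pattern of this series: the request size is sampled once with iov_iter_count() before ->direct_IO runs, replacing the old iov_length() sum over raw iovecs. A minimal sketch of that shape, under stated assumptions ("my_get_blocks" is a hypothetical get_block_t callback; error handling trimmed):

static int my_get_blocks(struct inode *, sector_t, struct buffer_head *, int);	/* hypothetical */

static ssize_t sketch_direct_IO(int rw, struct kiocb *iocb,
				struct iov_iter *iter, loff_t offset)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);	/* grab the size up front */
	ssize_t ret;

	ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, my_get_blocks);
	if (unlikely((rw & WRITE) && ret < 0)) {
		/* a failed extending write may have instantiated blocks
		 * beyond the old EOF; trim them back */
		loff_t isize = i_size_read(inode);
		loff_t end = offset + count;	/* was iov_length(iov, nr_segs) */

		if (end > isize && inode_newsize_ok(inode, isize) == 0)
			truncate_setsize(inode, isize);
	}
	return ret;
}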
@@@ -3323,9 -3127,8 +3323,9 @@@ int reiserfs_setattr(struct dentry *den
                dquot_initialize(inode);
        reiserfs_write_lock(inode->i_sb);
        if (attr->ia_valid & ATTR_SIZE) {
 -              /* version 2 items will be caught by the s_maxbytes check
 -               ** done for us in vmtruncate
 +              /*
 +               * version 2 items will be caught by the s_maxbytes check
 +               * done for us in vmtruncate
                 */
                if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
                    attr->ia_size > MAX_NON_LFS) {
                                err = journal_begin(&th, inode->i_sb, 4);
                                if (!err) {
                                        reiserfs_discard_prealloc(&th, inode);
 -                                      err = journal_end(&th, inode->i_sb, 4);
 +                                      err = journal_end(&th);
                                }
                                if (err)
                                        error = err;
                if (error)
                        return error;
  
 -              /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
 +              /*
 +               * (user+group)*(old+new) structure - we count quota
 +               * info and inode write (sb, inode)
 +               */
                reiserfs_write_lock(inode->i_sb);
                error = journal_begin(&th, inode->i_sb, jbegin_count);
                reiserfs_write_unlock(inode->i_sb);
                error = dquot_transfer(inode, attr);
                reiserfs_write_lock(inode->i_sb);
                if (error) {
 -                      journal_end(&th, inode->i_sb, jbegin_count);
 +                      journal_end(&th);
                        reiserfs_write_unlock(inode->i_sb);
                        goto out;
                }
  
 -              /* Update corresponding info in inode so that everything is in
 -               * one transaction */
 +              /*
 +               * Update corresponding info in inode so that everything
 +               * is in one transaction
 +               */
                if (attr->ia_valid & ATTR_UID)
                        inode->i_uid = attr->ia_uid;
                if (attr->ia_valid & ATTR_GID)
                        inode->i_gid = attr->ia_gid;
                mark_inode_dirty(inode);
 -              error = journal_end(&th, inode->i_sb, jbegin_count);
 +              error = journal_end(&th);
                reiserfs_write_unlock(inode->i_sb);
                if (error)
                        goto out;
            attr->ia_size != i_size_read(inode)) {
                error = inode_newsize_ok(inode, attr->ia_size);
                if (!error) {
 +                      /*
 +                       * Could race against reiserfs_file_release
 +                       * if called from NFS, so take tailpack mutex.
 +                       */
 +                      mutex_lock(&REISERFS_I(inode)->tailpack);
                        truncate_setsize(inode, attr->ia_size);
 -                      reiserfs_vfs_truncate_file(inode);
 +                      reiserfs_truncate_file(inode, 1);
 +                      mutex_unlock(&REISERFS_I(inode)->tailpack);
                }
        }
  
diff --combined fs/ubifs/file.c
index 0ab7f7dfb98b632818a9b1dde1e74f4799633b8b,0888502a60415223ba9285447c9e2f4425bc4076..b5b593c4527005ba50fe0745f2651095dba79331
@@@ -903,9 -903,8 +903,9 @@@ static int do_writepage(struct page *pa
        struct ubifs_info *c = inode->i_sb->s_fs_info;
  
  #ifdef UBIFS_DEBUG
 +      struct ubifs_inode *ui = ubifs_inode(inode);
        spin_lock(&ui->ui_lock);
 -      ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE);
 +      ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT);
        spin_unlock(&ui->ui_lock);
  #endif
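
The assertion fix above is a units bug: page->index counts pages while synced_i_size counts bytes, so the byte value must be shifted down by PAGE_CACHE_SHIFT, not shifted up by PAGE_CACHE_SIZE (which is 4096, an absurd shift count). A self-contained plain-C check of the unit conversion, with made-up values:

#include <assert.h>

#define PAGE_SHIFT 12	/* stand-in for PAGE_CACHE_SHIFT */

int main(void)
{
	unsigned long long synced_i_size = 5ULL << PAGE_SHIFT;	/* 5 pages, in bytes */
	unsigned long page_index = 4;				/* last synced page */

	/* pages compare against pages: convert bytes with >> PAGE_SHIFT */
	assert(page_index <= (synced_i_size >> PAGE_SHIFT));
	return 0;
}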
  
@@@ -1364,17 -1363,17 +1364,17 @@@ static inline int mctime_update_needed(
  
  /**
  * update_mctime - update mtime and ctime of an inode.
-  * @c: UBIFS file-system description object
   * @inode: inode to update
   *
  * This function updates the mtime and ctime of the inode if they differ from
  * the current time. Returns zero on success and a negative error code on
  * failure.
   */
- static int update_mctime(struct ubifs_info *c, struct inode *inode)
+ static int update_mctime(struct inode *inode)
  {
        struct timespec now = ubifs_current_time(inode);
        struct ubifs_inode *ui = ubifs_inode(inode);
+       struct ubifs_info *c = inode->i_sb->s_fs_info;
  
        if (mctime_update_needed(inode, &now)) {
                int err, release;
        return 0;
  }
  
- static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                              unsigned long nr_segs, loff_t pos)
+ static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
-       int err;
-       struct inode *inode = iocb->ki_filp->f_mapping->host;
-       struct ubifs_info *c = inode->i_sb->s_fs_info;
-       err = update_mctime(c, inode);
+       int err = update_mctime(file_inode(iocb->ki_filp));
        if (err)
                return err;
  
-       return generic_file_aio_write(iocb, iov, nr_segs, pos);
+       return generic_file_write_iter(iocb, from);
  }
  
  static int ubifs_set_page_dirty(struct page *page)
@@@ -1526,7 -1520,8 +1521,7 @@@ static int ubifs_vm_page_mkwrite(struc
        }
  
        wait_for_stable_page(page);
 -      unlock_page(page);
 -      return 0;
 +      return VM_FAULT_LOCKED;
  
  out_unlock:
        unlock_page(page);
@@@ -1582,15 -1577,15 +1577,15 @@@ const struct inode_operations ubifs_sym
  
  const struct file_operations ubifs_file_operations = {
        .llseek         = generic_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = generic_file_aio_read,
-       .aio_write      = ubifs_aio_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = generic_file_read_iter,
+       .write_iter     = ubifs_write_iter,
        .mmap           = ubifs_file_mmap,
        .fsync          = ubifs_fsync,
        .unlocked_ioctl = ubifs_ioctl,
        .splice_read    = generic_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = ubifs_compat_ioctl,
  #endif
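
The .read/.write slots now point at the new_sync_read()/new_sync_write() shims, which adapt plain read(2)/write(2) to ->read_iter/->write_iter. Roughly what the write-side shim in this series does, reconstructed as a sketch (details approximate, not a verbatim copy):

static ssize_t new_sync_write_sketch(struct file *filp, const char __user *buf,
				     size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_nbytes = len;
	iov_iter_init(&iter, WRITE, &iov, 1, len);	/* one-segment iterator */

	ret = filp->f_op->write_iter(&kiocb, &iter);
	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}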
diff --combined fs/xfs/xfs_aops.c
index e32640eedea6430759310ab7b73ce909ab3bc445,08d13e3952524fdfef6fea570955e51eec3824e4..faaf716e2080ad5d41cd86dd05c1ac8f4e3e2fad
@@@ -975,39 -975,14 +975,39 @@@ xfs_vm_writepage
         * Given that we do not allow direct reclaim to call us, we should
         * never be called while in a filesystem transaction.
         */
 -      if (WARN_ON(current->flags & PF_FSTRANS))
 +      if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
                goto redirty;
  
        /* Is this page beyond the end of the file? */
        offset = i_size_read(inode);
        end_index = offset >> PAGE_CACHE_SHIFT;
        last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
 -      if (page->index >= end_index) {
 +
 +      /*
 +       * If the page index is less than end_index, adjust end_offset
 +       * to the highest offset that this page should represent.
 +       * -----------------------------------------------------
 +       * |                    file mapping           | <EOF> |
 +       * -----------------------------------------------------
 +       * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
 +       * ^--------------------------------^----------|--------
 +       * |     desired writeback range    |      see else    |
 +       * ---------------------------------^------------------|
 +       */
 +      if (page->index < end_index)
 +              end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
 +      else {
 +              /*
 +               * Check whether the page to write out is beyond or straddles
 +               * i_size.
 +               * -------------------------------------------------------
 +               * |            file mapping                    | <EOF>  |
 +               * -------------------------------------------------------
 +               * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
 +               * ^--------------------------------^-----------|---------
 +               * |                                |      Straddles     |
 +               * ---------------------------------^-----------|--------|
 +               */
                unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
  
                /*
                 * truncate operation that is in progress. We must redirty the
                 * page so that reclaim stops reclaiming it. Otherwise
                 * xfs_vm_releasepage() is called on it and gets confused.
 +               *
 +               * Note that end_index is an unsigned long: it would overflow
 +               * if the given offset were greater than 16TB on a 32-bit
 +               * system and we checked whether the page is fully outside
 +               * i_size via "if (page->index >= end_index + 1)", because
 +               * "end_index + 1" would evaluate to 0.  The page would then
 +               * be redirtied and written out repeatedly, resulting in an
 +               * infinite loop; the user program performing the operation
 +               * would hang.  Instead, we verify the situation by checking
 +               * whether the page to write is totally beyond i_size or
 +               * whether its offset equals the EOF.
                 */
 -              if (page->index >= end_index + 1 || offset_into_page == 0)
 +              if (page->index > end_index ||
 +                  (page->index == end_index && offset_into_page == 0))
                        goto redirty;
  
                /*
                 * The page straddles i_size.  It must be zeroed out on each
                 * and every writepage invocation because it may be mmapped.
                 * "A file is mapped in multiples of the page size.  For a file
 -               * that is not a multiple of the  page size, the remaining
 +               * that is not a multiple of the page size, the remaining
                 * memory is zeroed when mapped, and writes to that region are
                 * not written out to the file."
                 */
                zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
 +
 +              /* Adjust the end_offset to the end of file */
 +              end_offset = offset;
        }
  
 -      end_offset = min_t(unsigned long long,
 -                      (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
 -                      offset);
        len = 1 << inode->i_blkbits;
  
        bh = head = page_buffers(page);
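
The comment above can be checked with ordinary unsigned arithmetic. A self-contained demonstration (plain C, illustrative 32-bit values; with i_size in the last page below 16TB, a 32-bit end_index is 0xffffffff, so "end_index + 1" wraps to 0):

#include <assert.h>

int main(void)
{
	unsigned int end_index = 0xffffffffu;	/* i_size >> PAGE_SHIFT on 32-bit */
	unsigned int page_index = end_index;	/* the page straddling EOF */
	unsigned int offset_into_page = 123;	/* EOF is not page-aligned */

	/* old check: "+ 1" wraps to 0, so the straddling page is wrongly
	 * treated as fully beyond EOF and redirtied forever */
	assert(page_index >= end_index + 1 || offset_into_page == 0);

	/* new check from the hunk above: no "+ 1", no wrap; the page is
	 * correctly sent down the zeroing path instead */
	assert(!(page_index > end_index ||
		 (page_index == end_index && offset_into_page == 0)));
	return 0;
}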
@@@ -1225,9 -1188,9 +1225,9 @@@ xfs_vm_releasepage
  
        xfs_count_page_state(page, &delalloc, &unwritten);
  
 -      if (WARN_ON(delalloc))
 +      if (WARN_ON_ONCE(delalloc))
                return 0;
 -      if (WARN_ON(unwritten))
 +      if (WARN_ON_ONCE(unwritten))
                return 0;
  
        return try_to_free_buffers(page);
@@@ -1486,9 -1449,8 +1486,8 @@@ STATIC ssize_
  xfs_vm_direct_IO(
        int                     rw,
        struct kiocb            *iocb,
-       const struct iovec      *iov,
-       loff_t                  offset,
-       unsigned long           nr_segs)
+       struct iov_iter         *iter,
+       loff_t                  offset)
  {
        struct inode            *inode = iocb->ki_filp->f_mapping->host;
        struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
        ssize_t                 ret;
  
        if (rw & WRITE) {
-               size_t size = iov_length(iov, nr_segs);
+               size_t size = iov_iter_count(iter);
  
                /*
                 * We cannot preallocate a size update transaction here as we
                if (offset + size > XFS_I(inode)->i_d.di_size)
                        ioend->io_isdirect = 1;
  
-               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
-                                           offset, nr_segs,
-                                           xfs_get_blocks_direct,
+               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+                                           offset, xfs_get_blocks_direct,
                                            xfs_end_io_direct_write, NULL,
                                            DIO_ASYNC_EXTEND);
                if (ret != -EIOCBQUEUED && iocb->private)
                        goto out_destroy_ioend;
        } else {
-               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
-                                           offset, nr_segs,
-                                           xfs_get_blocks_direct,
+               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+                                           offset, xfs_get_blocks_direct,
                                            NULL, NULL, 0);
        }
  
diff --combined fs/xfs/xfs_file.c
index 1b8160dc04d120326de6bf39634073b9b7d7e98f,500c3f0656d0a27676955c7cfc757291fbee3d5d..1f66779d7a46628cf3a068dd5c08b36368fb6545
@@@ -229,34 -229,27 +229,27 @@@ xfs_file_fsync
  }
  
  STATIC ssize_t
- xfs_file_aio_read(
+ xfs_file_read_iter(
        struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos)
+       struct iov_iter         *to)
  {
        struct file             *file = iocb->ki_filp;
        struct inode            *inode = file->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
-       size_t                  size = 0;
+       size_t                  size = iov_iter_count(to);
        ssize_t                 ret = 0;
        int                     ioflags = 0;
        xfs_fsize_t             n;
+       loff_t                  pos = iocb->ki_pos;
  
        XFS_STATS_INC(xs_read_calls);
  
-       BUG_ON(iocb->ki_pos != pos);
        if (unlikely(file->f_flags & O_DIRECT))
                ioflags |= IO_ISDIRECT;
        if (file->f_mode & FMODE_NOCMTIME)
                ioflags |= IO_INVIS;
  
-       ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
-       if (ret < 0)
-               return ret;
        if (unlikely(ioflags & IO_ISDIRECT)) {
                xfs_buftarg_t   *target =
                        XFS_IS_REALTIME_INODE(ip) ?
  
        trace_xfs_file_read(ip, size, pos, ioflags);
  
-       ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
+       ret = generic_file_read_iter(iocb, to);
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
  
@@@ -349,47 -342,6 +342,6 @@@ xfs_file_splice_read
        return ret;
  }
  
- /*
-  * xfs_file_splice_write() does not use xfs_rw_ilock() because
-  * generic_file_splice_write() takes the i_mutex itself. This, in theory,
-  * could cause lock inversions between the aio_write path and the splice path
-  * if someone is doing concurrent splice(2) based writes and write(2) based
-  * writes to the same inode. The only real way to fix this is to re-implement
-  * the generic code here with correct locking orders.
-  */
- STATIC ssize_t
- xfs_file_splice_write(
-       struct pipe_inode_info  *pipe,
-       struct file             *outfilp,
-       loff_t                  *ppos,
-       size_t                  count,
-       unsigned int            flags)
- {
-       struct inode            *inode = outfilp->f_mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       int                     ioflags = 0;
-       ssize_t                 ret;
-       XFS_STATS_INC(xs_write_calls);
-       if (outfilp->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return -EIO;
-       xfs_ilock(ip, XFS_IOLOCK_EXCL);
-       trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
-       ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
-       if (ret > 0)
-               XFS_STATS_ADD(xs_write_bytes, ret);
-       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-       return ret;
- }
  /*
   * This routine is called to handle zeroing any space in the last block of the
   * file that is beyond the EOF.  We do this since the size is being increased
@@@ -625,10 -577,7 +577,7 @@@ restart
  STATIC ssize_t
  xfs_file_dio_aio_write(
        struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos,
-       size_t                  ocount)
+       struct iov_iter         *from)
  {
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 ret = 0;
-       size_t                  count = ocount;
        int                     unaligned_io = 0;
        int                     iolock;
+       size_t                  count = iov_iter_count(from);
+       loff_t                  pos = iocb->ki_pos;
        struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
  
        ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
        if (ret)
                goto out;
+       iov_iter_truncate(from, count);
  
        if (mapping->nrpages) {
                ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
        }
  
        trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-       ret = generic_file_direct_write(iocb, iovp,
-                       &nr_segs, pos, count, ocount);
+       ret = generic_file_direct_write(iocb, from, pos);
  
  out:
        xfs_rw_iunlock(ip, iolock);
  STATIC ssize_t
  xfs_file_buffered_aio_write(
        struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos,
-       size_t                  count)
+       struct iov_iter         *from)
  {
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        ssize_t                 ret;
        int                     enospc = 0;
        int                     iolock = XFS_IOLOCK_EXCL;
-       struct iov_iter         from;
+       loff_t                  pos = iocb->ki_pos;
+       size_t                  count = iov_iter_count(from);
  
        xfs_rw_ilock(ip, iolock);
  
        if (ret)
                goto out;
  
-       iov_iter_init(&from, iovp, nr_segs, count, 0);
+       iov_iter_truncate(from, count);
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
  
  write_retry:
        trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
-       ret = generic_perform_write(file, &from, pos);
+       ret = generic_perform_write(file, from, pos);
        if (likely(ret >= 0))
                iocb->ki_pos = pos + ret;
        /*
@@@ -759,40 -707,29 +707,29 @@@ out
  }
  
  STATIC ssize_t
- xfs_file_aio_write(
+ xfs_file_write_iter(
        struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos)
+       struct iov_iter         *from)
  {
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 ret;
-       size_t                  ocount = 0;
+       size_t                  ocount = iov_iter_count(from);
  
        XFS_STATS_INC(xs_write_calls);
  
-       BUG_ON(iocb->ki_pos != pos);
-       ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
-       if (ret)
-               return ret;
        if (ocount == 0)
                return 0;
  
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-               ret = -EIO;
-               goto out;
-       }
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return -EIO;
  
        if (unlikely(file->f_flags & O_DIRECT))
-               ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
+               ret = xfs_file_dio_aio_write(iocb, from);
        else
-               ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
-                                                 ocount);
+               ret = xfs_file_buffered_aio_write(iocb, from);
  
        if (ret > 0) {
                ssize_t err;
                if (err < 0)
                        ret = err;
        }
- out:
        return ret;
  }
  
@@@ -944,7 -879,7 +879,7 @@@ xfs_dir_open
         */
        mode = xfs_ilock_data_map_shared(ip);
        if (ip->i_d.di_nextents > 0)
 -              xfs_dir3_data_readahead(NULL, ip, 0, -1);
 +              xfs_dir3_data_readahead(ip, 0, -1);
        xfs_iunlock(ip, mode);
        return 0;
  }
@@@ -1461,12 -1396,12 +1396,12 @@@ xfs_file_llseek
  
  const struct file_operations xfs_file_operations = {
        .llseek         = xfs_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = xfs_file_aio_read,
-       .aio_write      = xfs_file_aio_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = xfs_file_read_iter,
+       .write_iter     = xfs_file_write_iter,
        .splice_read    = xfs_file_splice_read,
-       .splice_write   = xfs_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .unlocked_ioctl = xfs_file_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = xfs_file_compat_ioctl,
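
Seen from the filesystem side, a conversion in this series follows one template: implement only the iter entry points and let the shims and iter_file_splice_write() cover read(2)/write(2) and splice(2). A hedged sketch with a hypothetical "myfs":

static ssize_t myfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	return generic_file_read_iter(iocb, to);	/* page cache + O_DIRECT */
}

static ssize_t myfs_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return generic_file_write_iter(iocb, from);	/* takes i_mutex, syncs */
}

const struct file_operations myfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= new_sync_read,		/* shim over ->read_iter */
	.write		= new_sync_write,		/* shim over ->write_iter */
	.read_iter	= myfs_read_iter,
	.write_iter	= myfs_write_iter,
	.mmap		= generic_file_mmap,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,	/* splice via ->write_iter */
};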
diff --combined fs/xfs/xfs_trace.h
index 6910458915cfea9133cc3c39ff56e5f7f775c065,53182f97cf011e62e4806ce4bf3dbdfc71f00835..152f82782630222321bcd234b20c0ffb0a626e34
@@@ -538,64 -538,6 +538,64 @@@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_r
  DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
  DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
  
 +DECLARE_EVENT_CLASS(xfs_filestream_class,
 +      TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
 +      TP_ARGS(ip, agno),
 +      TP_STRUCT__entry(
 +              __field(dev_t, dev)
 +              __field(xfs_ino_t, ino)
 +              __field(xfs_agnumber_t, agno)
 +              __field(int, streams)
 +      ),
 +      TP_fast_assign(
 +              __entry->dev = VFS_I(ip)->i_sb->s_dev;
 +              __entry->ino = ip->i_ino;
 +              __entry->agno = agno;
 +              __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
 +      ),
 +      TP_printk("dev %d:%d ino 0x%llx agno %u streams %d",
 +                MAJOR(__entry->dev), MINOR(__entry->dev),
 +                __entry->ino,
 +                __entry->agno,
 +                __entry->streams)
 +)
 +#define DEFINE_FILESTREAM_EVENT(name) \
 +DEFINE_EVENT(xfs_filestream_class, name, \
 +      TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \
 +      TP_ARGS(ip, agno))
 +DEFINE_FILESTREAM_EVENT(xfs_filestream_free);
 +DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
 +DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
 +
 +TRACE_EVENT(xfs_filestream_pick,
 +      TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno,
 +               xfs_extlen_t free, int nscan),
 +      TP_ARGS(ip, agno, free, nscan),
 +      TP_STRUCT__entry(
 +              __field(dev_t, dev)
 +              __field(xfs_ino_t, ino)
 +              __field(xfs_agnumber_t, agno)
 +              __field(int, streams)
 +              __field(xfs_extlen_t, free)
 +              __field(int, nscan)
 +      ),
 +      TP_fast_assign(
 +              __entry->dev = VFS_I(ip)->i_sb->s_dev;
 +              __entry->ino = ip->i_ino;
 +              __entry->agno = agno;
 +              __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
 +              __entry->free = free;
 +              __entry->nscan = nscan;
 +      ),
 +      TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d",
 +                MAJOR(__entry->dev), MINOR(__entry->dev),
 +                __entry->ino,
 +                __entry->agno,
 +                __entry->streams,
 +                __entry->free,
 +                __entry->nscan)
 +);
 +
  DECLARE_EVENT_CLASS(xfs_lock_class,
        TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
                 unsigned long caller_ip),
@@@ -1118,7 -1060,6 +1118,6 @@@ DEFINE_RW_EVENT(xfs_file_read)
  DEFINE_RW_EVENT(xfs_file_buffered_write);
  DEFINE_RW_EVENT(xfs_file_direct_write);
  DEFINE_RW_EVENT(xfs_file_splice_read);
- DEFINE_RW_EVENT(xfs_file_splice_write);
  
  DECLARE_EVENT_CLASS(xfs_page_class,
        TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
index d8e4cea23a257c1b9c8b1514493c407b13d912e0,86df13b97160eb9b4bd8ede7e92158fd7f0825e4..66c2167f04a9d5788b68e97f446757bad082d780
@@@ -5,8 -5,6 +5,6 @@@
  #ifndef __LINUX_BLK_TYPES_H
  #define __LINUX_BLK_TYPES_H
  
- #ifdef CONFIG_BLOCK
  #include <linux/types.h>
  
  struct bio_set;
@@@ -28,6 -26,8 +26,8 @@@ struct bio_vec 
        unsigned int    bv_offset;
  };
  
+ #ifdef CONFIG_BLOCK
  struct bvec_iter {
        sector_t                bi_sector;      /* device address in 512 byte
                                                   sectors */
@@@ -190,7 -190,6 +190,7 @@@ enum rq_flag_bits 
        __REQ_PM,               /* runtime pm request */
        __REQ_END,              /* last of chain of requests */
        __REQ_HASHED,           /* on IO scheduler merge hash */
 +      __REQ_MQ_INFLIGHT,      /* track inflight for MQ */
        __REQ_NR_BITS,          /* stops here */
  };
  
  #define REQ_PM                        (1ULL << __REQ_PM)
  #define REQ_END                       (1ULL << __REQ_END)
  #define REQ_HASHED            (1ULL << __REQ_HASHED)
 +#define REQ_MQ_INFLIGHT               (1ULL << __REQ_MQ_INFLIGHT)
  
  #endif /* __LINUX_BLK_TYPES_H */
diff --combined include/linux/fs.h
index c3f46e499dd0027eed7f2bd0a0bc3cd465c3ac44,4e92d551518d89d61f763775f5416bfc520e7630..338e6f758c6d922be7d8163361da051efa0e3cbc
@@@ -128,6 -128,10 +128,10 @@@ typedef void (dio_iodone_t)(struct kioc
  #define FMODE_ATOMIC_POS      ((__force fmode_t)0x8000)
  /* Write access to underlying fs */
  #define FMODE_WRITER          ((__force fmode_t)0x10000)
+ /* Has read method(s) */
+ #define FMODE_CAN_READ          ((__force fmode_t)0x20000)
+ /* Has write method(s) */
+ #define FMODE_CAN_WRITE         ((__force fmode_t)0x40000)
  
  /* File was opened by fanotify and shouldn't generate fanotify events */
  #define FMODE_NONOTIFY                ((__force fmode_t)0x1000000)
@@@ -343,8 -347,7 +347,7 @@@ struct address_space_operations 
        void (*invalidatepage) (struct page *, unsigned int, unsigned int);
        int (*releasepage) (struct page *, gfp_t);
        void (*freepage)(struct page *);
-       ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
-                       loff_t offset, unsigned long nr_segs);
+       ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
        int (*get_xip_mem)(struct address_space *, pgoff_t, int,
                                                void **, unsigned long *);
        /*
@@@ -1448,6 -1451,8 +1451,8 @@@ struct block_device_operations
  #define HAVE_COMPAT_IOCTL 1
  #define HAVE_UNLOCKED_IOCTL 1
  
+ struct iov_iter;
  struct file_operations {
        struct module *owner;
        loff_t (*llseek) (struct file *, loff_t, int);
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
        ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+       ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
+       ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
        int (*iterate) (struct file *, struct dir_context *);
        unsigned int (*poll) (struct file *, struct poll_table_struct *);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
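
With three possible entry points per direction, the new FMODE_CAN_READ/FMODE_CAN_WRITE bits let the VFS decide readability and writability once at open time instead of probing f_op on every call. A sketch of the shape of that check (its exact placement in do_dentry_open() is an assumption, not quoted from the series):

static void sketch_set_fmode_caps(struct file *f)
{
	if (f->f_op->read || f->f_op->aio_read || f->f_op->read_iter)
		f->f_mode |= FMODE_CAN_READ;
	if (f->f_op->write || f->f_op->aio_write || f->f_op->write_iter)
		f->f_mode |= FMODE_CAN_WRITE;
}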
@@@ -2404,20 -2411,18 +2411,18 @@@ extern int generic_file_readonly_mmap(s
  extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
                unsigned long size, pgoff_t pgoff);
  int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
- extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
- extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long);
- extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
- extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
-               unsigned long *, loff_t, size_t, size_t);
+ extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
+ extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
+ extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
+ extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, loff_t);
  extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
  extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
  extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
- extern int generic_segment_checks(const struct iovec *iov,
-               unsigned long *nr_segs, size_t *count, int access_flags);
+ extern ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
+ extern ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
  
  /* fs/block_dev.c */
- extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos);
+ extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from);
  extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
                        int datasync);
  extern void block_sync_page(struct page *page);
@@@ -2427,7 -2432,7 +2432,7 @@@ extern ssize_t generic_file_splice_read
                struct pipe_inode_info *, size_t, unsigned int);
  extern ssize_t default_file_splice_read(struct file *, loff_t *,
                struct pipe_inode_info *, size_t, unsigned int);
- extern ssize_t generic_file_splice_write(struct pipe_inode_info *,
+ extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
                struct file *, loff_t *, size_t, unsigned int);
  extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
                struct file *out, loff_t *, size_t len, unsigned int flags);
@@@ -2477,16 -2482,16 +2482,16 @@@ enum 
  void dio_end_io(struct bio *bio, int error);
  
  ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
-       struct block_device *bdev, const struct iovec *iov, loff_t offset,
-       unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+       struct block_device *bdev, struct iov_iter *iter, loff_t offset,
+       get_block_t get_block, dio_iodone_t end_io,
        dio_submit_t submit_io, int flags);
  
  static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
-               struct inode *inode, const struct iovec *iov, loff_t offset,
-               unsigned long nr_segs, get_block_t get_block)
+               struct inode *inode, struct iov_iter *iter, loff_t offset,
+               get_block_t get_block)
  {
-       return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-                                   offset, nr_segs, get_block, NULL, NULL,
+       return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iter,
+                                   offset, get_block, NULL, NULL,
                                    DIO_LOCKING | DIO_SKIP_HOLES);
  }
  #endif
@@@ -2590,7 -2595,6 +2595,7 @@@ extern ssize_t simple_read_from_buffer(
  extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count);
  
 +extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
  extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
  
  extern int generic_check_addressable(unsigned, u64);
diff --combined include/linux/nfs_fs.h
index 919576b8e2cfd612d5a2b852f1aa9674811585d1,0a82b6fbae8a4de63683877fce383ae1dee4fcfa..e30f6059ecd642b44c0cc599344c0421b713958f
@@@ -459,13 -459,12 +459,12 @@@ extern int nfs3_removexattr (struct den
  /*
   * linux/fs/nfs/direct.c
   */
- extern ssize_t nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t,
-                       unsigned long);
+ extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t);
  extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
-                       const struct iovec *iov, unsigned long nr_segs,
+                       struct iov_iter *iter,
                        loff_t pos, bool uio);
  extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
-                       const struct iovec *iov, unsigned long nr_segs,
+                       struct iov_iter *iter,
                        loff_t pos, bool uio);
  
  /*
@@@ -520,6 -519,7 +519,6 @@@ extern int  nfs_writepage(struct page *
  extern int  nfs_writepages(struct address_space *, struct writeback_control *);
  extern int  nfs_flush_incompatible(struct file *file, struct page *page);
  extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
 -extern void nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
  
  /*
   * Try to write back everything synchronously (but check the
@@@ -552,6 -552,7 +551,6 @@@ nfs_have_writebacks(struct inode *inode
  extern int  nfs_readpage(struct file *, struct page *);
  extern int  nfs_readpages(struct file *, struct address_space *,
                struct list_head *, unsigned);
 -extern int  nfs_readpage_result(struct rpc_task *, struct nfs_read_data *);
  extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
                               struct page *);
  
diff --combined mm/filemap.c
index 7fadf1c6283844f07727a68fe12ce5f554f2fff6,7499ef19f1c15f4237b695c23d71414eecd97d3a..dafb06f70a09dd97b1fa690969a638f596714091
@@@ -742,7 -742,7 +742,7 @@@ void unlock_page(struct page *page
  {
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        clear_bit_unlock(PG_locked, &page->flags);
 -      smp_mb__after_clear_bit();
 +      smp_mb__after_atomic();
        wake_up_page(page, PG_locked);
  }
  EXPORT_SYMBOL(unlock_page);
   */
  void end_page_writeback(struct page *page)
  {
 -      if (TestClearPageReclaim(page))
 +      /*
 +       * TestClearPageReclaim could be used here but it is an atomic
 +       * operation and overkill in this particular case. Failing to
 +       * shuffle a page marked for immediate reclaim is too mild to
 +       * justify taking an atomic operation penalty at the end of
 +       * every page writeback.
 +       */
 +      if (PageReclaim(page)) {
 +              ClearPageReclaim(page);
                rotate_reclaimable_page(page);
 +      }
  
        if (!test_clear_page_writeback(page))
                BUG();
  
 -      smp_mb__after_clear_bit();
 +      smp_mb__after_atomic();
        wake_up_page(page, PG_writeback);
  }
  EXPORT_SYMBOL(end_page_writeback);
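
The trade-off described in the new comment, side by side: TestClearPageReclaim() is a locked read-modify-write on every writeback completion, while the split form costs a plain load in the common case (PageReclaim is rarely set) and only pays for the clear when needed. A sketch:

static inline void rotate_if_reclaim_atomic(struct page *page)
{
	if (TestClearPageReclaim(page))		/* atomic RMW, every time */
		rotate_reclaimable_page(page);
}

static inline void rotate_if_reclaim_cheap(struct page *page)
{
	if (PageReclaim(page)) {		/* plain test, usually false */
		ClearPageReclaim(page);		/* clear only in the rare case */
		rotate_reclaimable_page(page);
	}
}

The race this opens, a page marked for reclaim between the test and the clear missing one rotation to the LRU tail, is harmless, which is the comment's point.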
  
 +/*
 + * After completing I/O on a page, call this routine to update the page
 + * flags appropriately
 + */
 +void page_endio(struct page *page, int rw, int err)
 +{
 +      if (rw == READ) {
 +              if (!err) {
 +                      SetPageUptodate(page);
 +              } else {
 +                      ClearPageUptodate(page);
 +                      SetPageError(page);
 +              }
 +              unlock_page(page);
 +      } else { /* rw == WRITE */
 +              if (err) {
 +                      SetPageError(page);
 +                      if (page->mapping)
 +                              mapping_set_error(page->mapping, err);
 +              }
 +              end_page_writeback(page);
 +      }
 +}
 +EXPORT_SYMBOL_GPL(page_endio);
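
A typical (hypothetical) caller of the new helper: a bio completion handler that finishes page I/O for each segment, using the 3.16-era bi_end_io signature:

static void sketch_end_bio(struct bio *bio, int err)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i)
		page_endio(bvec->bv_page, bio_data_dir(bio), err);
	bio_put(bio);
}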
 +
  /**
   * __lock_page - get a lock on the page, assuming we need to sleep to get it
   * @page: the page to lock
@@@ -990,6 -956,26 +990,6 @@@ out
  }
  EXPORT_SYMBOL(find_get_entry);
  
 -/**
 - * find_get_page - find and get a page reference
 - * @mapping: the address_space to search
 - * @offset: the page index
 - *
 - * Looks up the page cache slot at @mapping & @offset.  If there is a
 - * page cache page, it is returned with an increased refcount.
 - *
 - * Otherwise, %NULL is returned.
 - */
 -struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
 -{
 -      struct page *page = find_get_entry(mapping, offset);
 -
 -      if (radix_tree_exceptional_entry(page))
 -              page = NULL;
 -      return page;
 -}
 -EXPORT_SYMBOL(find_get_page);
 -
  /**
   * find_lock_entry - locate, pin and lock a page cache entry
   * @mapping: the address_space to search
@@@ -1027,84 -1013,66 +1027,84 @@@ repeat
  EXPORT_SYMBOL(find_lock_entry);
  
  /**
 - * find_lock_page - locate, pin and lock a pagecache page
 + * pagecache_get_page - find and get a page reference
   * @mapping: the address_space to search
   * @offset: the page index
 + * @fgp_flags: FGP flags
 + * @gfp_mask: gfp mask to use if a page is to be allocated
   *
 - * Looks up the page cache slot at @mapping & @offset.  If there is a
 - * page cache page, it is returned locked and with an increased
 - * refcount.
 - *
 - * Otherwise, %NULL is returned.
 - *
 - * find_lock_page() may sleep.
 - */
 -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 -{
 -      struct page *page = find_lock_entry(mapping, offset);
 -
 -      if (radix_tree_exceptional_entry(page))
 -              page = NULL;
 -      return page;
 -}
 -EXPORT_SYMBOL(find_lock_page);
 -
 -/**
 - * find_or_create_page - locate or add a pagecache page
 - * @mapping: the page's address_space
 - * @index: the page's index into the mapping
 - * @gfp_mask: page allocation mode
 + * Looks up the page cache slot at @mapping & @offset.
   *
 - * Looks up the page cache slot at @mapping & @offset.  If there is a
 - * page cache page, it is returned locked and with an increased
 - * refcount.
 + * FGP flags modify how the page is returned
   *
 - * If the page is not present, a new page is allocated using @gfp_mask
 - * and added to the page cache and the VM's LRU list.  The page is
 - * returned locked and with an increased refcount.
 + * FGP_ACCESSED: the page will be marked accessed
 + * FGP_LOCK: the page is returned locked
 + * FGP_CREAT: if the page is not present, a new page is allocated using
 + *            @gfp_mask and added to the page cache and the VM's LRU
 + *            list. The page is returned locked and with an increased
 + *            refcount. Otherwise, %NULL is returned.
   *
 - * On memory exhaustion, %NULL is returned.
 + * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
 + * if the GFP flags specified for FGP_CREAT are atomic.
   *
 - * find_or_create_page() may sleep, even if @gfp_flags specifies an
 - * atomic allocation!
 + * If there is a page cache page, it is returned with an increased refcount.
   */
 -struct page *find_or_create_page(struct address_space *mapping,
 -              pgoff_t index, gfp_t gfp_mask)
 +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
 +      int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
  {
        struct page *page;
 -      int err;
 +
  repeat:
 -      page = find_lock_page(mapping, index);
 -      if (!page) {
 -              page = __page_cache_alloc(gfp_mask);
 +      page = find_get_entry(mapping, offset);
 +      if (radix_tree_exceptional_entry(page))
 +              page = NULL;
 +      if (!page)
 +              goto no_page;
 +
 +      if (fgp_flags & FGP_LOCK) {
 +              if (fgp_flags & FGP_NOWAIT) {
 +                      if (!trylock_page(page)) {
 +                              page_cache_release(page);
 +                              return NULL;
 +                      }
 +              } else {
 +                      lock_page(page);
 +              }
 +
 +              /* Has the page been truncated? */
 +              if (unlikely(page->mapping != mapping)) {
 +                      unlock_page(page);
 +                      page_cache_release(page);
 +                      goto repeat;
 +              }
 +              VM_BUG_ON_PAGE(page->index != offset, page);
 +      }
 +
 +      if (page && (fgp_flags & FGP_ACCESSED))
 +              mark_page_accessed(page);
 +
 +no_page:
 +      if (!page && (fgp_flags & FGP_CREAT)) {
 +              int err;
 +              if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
 +                      cache_gfp_mask |= __GFP_WRITE;
 +              if (fgp_flags & FGP_NOFS) {
 +                      cache_gfp_mask &= ~__GFP_FS;
 +                      radix_gfp_mask &= ~__GFP_FS;
 +              }
 +
 +              page = __page_cache_alloc(cache_gfp_mask);
                if (!page)
                        return NULL;
 -              /*
 -               * We want a regular kernel memory (not highmem or DMA etc)
 -               * allocation for the radix tree nodes, but we need to honour
 -               * the context-specific requirements the caller has asked for.
 -               * GFP_RECLAIM_MASK collects those requirements.
 -               */
 -              err = add_to_page_cache_lru(page, mapping, index,
 -                      (gfp_mask & GFP_RECLAIM_MASK));
 +
 +              if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
 +                      fgp_flags |= FGP_LOCK;
 +
 +              /* Init accessed so we avoid an atomic mark_page_accessed later */
 +              if (fgp_flags & FGP_ACCESSED)
 +                      init_page_accessed(page);
 +
 +              err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
                if (unlikely(err)) {
                        page_cache_release(page);
                        page = NULL;
                                goto repeat;
                }
        }
 +
        return page;
  }
 -EXPORT_SYMBOL(find_or_create_page);
 +EXPORT_SYMBOL(pagecache_get_page);
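
The helpers removed above (find_get_page, find_lock_page, find_or_create_page) survive as thin wrappers over pagecache_get_page(). Roughly the shape they take after this change (a sketch; flag and gfp choices are inferred from the removed bodies, not quoted):

static inline struct page *find_get_page_sketch(struct address_space *mapping,
						pgoff_t offset)
{
	return pagecache_get_page(mapping, offset, 0, 0, 0);
}

static inline struct page *
find_or_create_page_sketch(struct address_space *mapping, pgoff_t offset,
			   gfp_t gfp_mask)
{
	return pagecache_get_page(mapping, offset,
				  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
				  gfp_mask, gfp_mask & GFP_RECLAIM_MASK);
}

The second wrapper preserves the behaviour spelled out in the deleted comment: regular kernel memory for the radix-tree nodes, while honouring the caller's reclaim-related GFP bits.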
  
  /**
   * find_get_entries - gang pagecache lookup
@@@ -1412,6 -1379,39 +1412,6 @@@ repeat
  }
  EXPORT_SYMBOL(find_get_pages_tag);
  
 -/**
 - * grab_cache_page_nowait - returns locked page at given index in given cache
 - * @mapping: target address_space
 - * @index: the page index
 - *
 - * Same as grab_cache_page(), but do not wait if the page is unavailable.
 - * This is intended for speculative data generators, where the data can
 - * be regenerated if the page couldn't be grabbed.  This routine should
 - * be safe to call while holding the lock for another page.
 - *
 - * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 - * and deadlock against the caller's locked page.
 - */
 -struct page *
 -grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 -{
 -      struct page *page = find_get_page(mapping, index);
 -
 -      if (page) {
 -              if (trylock_page(page))
 -                      return page;
 -              page_cache_release(page);
 -              return NULL;
 -      }
 -      page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
 -      if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
 -              page_cache_release(page);
 -              page = NULL;
 -      }
 -      return page;
 -}
 -EXPORT_SYMBOL(grab_cache_page_nowait);
 -
  /*
   * CD/DVDs are error prone. When a medium error occurs, the driver may fail
   * a _large_ part of the i/o request. Imagine the worst scenario:
        return written ? written : error;
  }
  
- /*
-  * Performs necessary checks before doing a write
-  * @iov:      io vector request
-  * @nr_segs:  number of segments in the iovec
-  * @count:    number of bytes to write
-  * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
-  *
-  * Adjust number of segments and amount of bytes to write (nr_segs should be
-  * properly initialized first). Returns appropriate error code that caller
-  * should return or zero in case that write should be allowed.
-  */
- int generic_segment_checks(const struct iovec *iov,
-                       unsigned long *nr_segs, size_t *count, int access_flags)
- {
-       unsigned long   seg;
-       size_t cnt = 0;
-       for (seg = 0; seg < *nr_segs; seg++) {
-               const struct iovec *iv = &iov[seg];
-               /*
-                * If any segment has a negative length, or the cumulative
-                * length ever wraps negative then return -EINVAL.
-                */
-               cnt += iv->iov_len;
-               if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
-                       return -EINVAL;
-               if (access_ok(access_flags, iv->iov_base, iv->iov_len))
-                       continue;
-               if (seg == 0)
-                       return -EFAULT;
-               *nr_segs = seg;
-               cnt -= iv->iov_len;     /* This segment is no good */
-               break;
-       }
-       *count = cnt;
-       return 0;
- }
- EXPORT_SYMBOL(generic_segment_checks);
  /**
-  * generic_file_aio_read - generic filesystem read routine
+  * generic_file_read_iter - generic filesystem read routine
   * @iocb:     kernel I/O control block
-  * @iov:      io vector request
-  * @nr_segs:  number of segments in the iovec
-  * @pos:      current file position
+  * @iter:     destination for the data read
   *
-  * This is the "read()" routine for all filesystems
+  * This is the "read_iter()" routine for all filesystems
   * that can use the page cache directly.
   */
  ssize_t
- generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long nr_segs, loff_t pos)
+ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
  {
-       struct file *filp = iocb->ki_filp;
-       ssize_t retval;
-       size_t count;
+       struct file *file = iocb->ki_filp;
+       ssize_t retval = 0;
        loff_t *ppos = &iocb->ki_pos;
-       struct iov_iter i;
-       count = 0;
-       retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
-       if (retval)
-               return retval;
-       iov_iter_init(&i, iov, nr_segs, count, 0);
+       loff_t pos = *ppos;
  
        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
-       if (filp->f_flags & O_DIRECT) {
+       if (file->f_flags & O_DIRECT) {
+               struct address_space *mapping = file->f_mapping;
+               struct inode *inode = mapping->host;
+               size_t count = iov_iter_count(iter);
                loff_t size;
-               struct address_space *mapping;
-               struct inode *inode;
  
-               mapping = filp->f_mapping;
-               inode = mapping->host;
                if (!count)
                        goto out; /* skip atime */
                size = i_size_read(inode);
                retval = filemap_write_and_wait_range(mapping, pos,
-                                       pos + iov_length(iov, nr_segs) - 1);
+                                       pos + count - 1);
                if (!retval) {
-                       retval = mapping->a_ops->direct_IO(READ, iocb,
-                                                          iov, pos, nr_segs);
+                       struct iov_iter data = *iter;
+                       retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos);
                }
                if (retval > 0) {
                        *ppos = pos + retval;
-                       count -= retval;
-                       /*
-                        * If we did a short DIO read we need to skip the
-                        * section of the iov that we've already read data into.
-                        */
-                       iov_iter_advance(&i, retval);
+                       iov_iter_advance(iter, retval);
                }
  
                /*
                 * and return.  Otherwise fallthrough to buffered io for
                 * the rest of the read.
                 */
-               if (retval < 0 || !count || *ppos >= size) {
-                       file_accessed(filp);
+               if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {
+                       file_accessed(file);
                        goto out;
                }
        }
  
-       retval = do_generic_file_read(filp, ppos, &i, retval);
+       retval = do_generic_file_read(file, ppos, iter, retval);
  out:
        return retval;
  }
- EXPORT_SYMBOL(generic_file_aio_read);
+ EXPORT_SYMBOL(generic_file_read_iter);
  
  #ifdef CONFIG_MMU
  /**
@@@ -2381,14 -2327,13 +2327,12 @@@ int pagecache_write_end(struct file *fi
  {
        const struct address_space_operations *aops = mapping->a_ops;
  
 -      mark_page_accessed(page);
        return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
  }
  EXPORT_SYMBOL(pagecache_write_end);
  
  ssize_t
- generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long *nr_segs, loff_t pos,
-               size_t count, size_t ocount)
+ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
  {
        struct file     *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        ssize_t         written;
        size_t          write_len;
        pgoff_t         end;
+       struct iov_iter data;
  
-       if (count != ocount)
-               *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
-       write_len = iov_length(iov, *nr_segs);
+       write_len = iov_iter_count(from);
        end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
  
        written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
                }
        }
  
-       written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+       data = *from;
+       written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
  
        /*
         * Finally, try again to invalidate clean pages which might have been
  
        if (written > 0) {
                pos += written;
+               iov_iter_advance(from, written);
                if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
                        i_size_write(inode, pos);
                        mark_inode_dirty(inode);
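
Note the copy-then-advance idiom above: ->direct_IO is handed a copy of the iterator ("data = *from"), so a failed or short transfer leaves the caller's iov_iter untouched, and only the bytes actually written are committed back with iov_iter_advance(). In isolation (sketch):

static ssize_t sketch_dio_write(struct kiocb *iocb, struct iov_iter *from,
				loff_t pos)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct iov_iter data = *from;	/* ->direct_IO may consume this freely */
	ssize_t written;

	written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
	if (written > 0)
		iov_iter_advance(from, written);	/* commit only what was written */
	return written;
}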
@@@ -2462,18 -2407,34 +2406,18 @@@ EXPORT_SYMBOL(generic_file_direct_write
  struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                        pgoff_t index, unsigned flags)
  {
 -      int status;
 -      gfp_t gfp_mask;
        struct page *page;
 -      gfp_t gfp_notmask = 0;
 +      int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
  
 -      gfp_mask = mapping_gfp_mask(mapping);
 -      if (mapping_cap_account_dirty(mapping))
 -              gfp_mask |= __GFP_WRITE;
        if (flags & AOP_FLAG_NOFS)
 -              gfp_notmask = __GFP_FS;
 -repeat:
 -      page = find_lock_page(mapping, index);
 +              fgp_flags |= FGP_NOFS;
 +
 +      page = pagecache_get_page(mapping, index, fgp_flags,
 +                      mapping_gfp_mask(mapping),
 +                      GFP_KERNEL);
        if (page)
 -              goto found;
 +              wait_for_stable_page(page);
  
 -      page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
 -      if (!page)
 -              return NULL;
 -      status = add_to_page_cache_lru(page, mapping, index,
 -                                              GFP_KERNEL & ~gfp_notmask);
 -      if (unlikely(status)) {
 -              page_cache_release(page);
 -              if (status == -EEXIST)
 -                      goto repeat;
 -              return NULL;
 -      }
 -found:
 -      wait_for_stable_page(page);
        return page;
  }
  EXPORT_SYMBOL(grab_cache_page_write_begin);
@@@ -2522,7 -2483,7 +2466,7 @@@ again
  
                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                                &page, &fsdata);
 -              if (unlikely(status))
 +              if (unlikely(status < 0))
                        break;
  
                if (mapping_writably_mapped(mapping))
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
                flush_dcache_page(page);
  
 -              mark_page_accessed(page);
                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
                if (unlikely(status < 0))
  EXPORT_SYMBOL(generic_perform_write);
  
  /**
-  * __generic_file_aio_write - write data to a file
+  * __generic_file_write_iter - write data to a file
   * @iocb:     IO state structure (file, offset, etc.)
-  * @iov:      vector with data to write
-  * @nr_segs:  number of segments in the vector
+  * @from:     iov_iter with data to write
   *
   * This function does all the work needed for actually writing data to a
   * file. It does all basic checks, removes SUID from the file, updates
   * A caller has to handle it. This is mainly due to the fact that we want to
   * avoid syncing under i_mutex.
   */
- ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                unsigned long nr_segs)
+ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct address_space * mapping = file->f_mapping;
-       size_t ocount;          /* original count */
-       size_t count;           /* after file limit checks */
        struct inode    *inode = mapping->host;
        loff_t          pos = iocb->ki_pos;
        ssize_t         written = 0;
        ssize_t         err;
        ssize_t         status;
-       struct iov_iter from;
-       ocount = 0;
-       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-       if (err)
-               return err;
-       count = ocount;
+       size_t          count = iov_iter_count(from);
  
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
        if (count == 0)
                goto out;
  
+       iov_iter_truncate(from, count);
        err = file_remove_suid(file);
        if (err)
                goto out;
        if (err)
                goto out;
  
-       iov_iter_init(&from, iov, nr_segs, count, 0);
        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (unlikely(file->f_flags & O_DIRECT)) {
                loff_t endbyte;
  
-               written = generic_file_direct_write(iocb, iov, &from.nr_segs, pos,
-                                                       count, ocount);
+               written = generic_file_direct_write(iocb, from, pos);
                if (written < 0 || written == count)
                        goto out;
-               iov_iter_advance(&from, written);
  
                /*
                 * direct-io write to a hole: fall through to buffered I/O
                pos += written;
                count -= written;
  
-               status = generic_perform_write(file, &from, pos);
+               status = generic_perform_write(file, from, pos);
                /*
                 * If generic_perform_write() returned a synchronous error
                 * then we want to return the number of bytes which were
                         */
                }
        } else {
-               written = generic_perform_write(file, &from, pos);
+               written = generic_perform_write(file, from, pos);
                if (likely(written >= 0))
                        iocb->ki_pos = pos + written;
        }
        current->backing_dev_info = NULL;
        return written ? written : err;
  }
- EXPORT_SYMBOL(__generic_file_aio_write);
+ EXPORT_SYMBOL(__generic_file_write_iter);
  
  /**
-  * generic_file_aio_write - write data to a file
+  * generic_file_write_iter - write data to a file
   * @iocb:     IO state structure
-  * @iov:      vector with data to write
-  * @nr_segs:  number of segments in the vector
-  * @pos:      position in file where to write
+  * @from:     iov_iter with data to write
   *
-  * This is a wrapper around __generic_file_aio_write() to be used by most
+  * This is a wrapper around __generic_file_write_iter() to be used by most
   * filesystems. It takes care of syncing the file in case of O_SYNC file
   * and acquires i_mutex as needed.
   */
- ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long nr_segs, loff_t pos)
+ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;
  
-       BUG_ON(iocb->ki_pos != pos);
        mutex_lock(&inode->i_mutex);
-       ret = __generic_file_aio_write(iocb, iov, nr_segs);
+       ret = __generic_file_write_iter(iocb, from);
        mutex_unlock(&inode->i_mutex);
  
        if (ret > 0) {
        }
        return ret;
  }
- EXPORT_SYMBOL(generic_file_aio_write);
+ EXPORT_SYMBOL(generic_file_write_iter);
  
  /**
   * try_to_release_page() - release old fs-specific metadata on a page
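
The wrapper pattern above is what converted filesystems now follow: take i_mutex around __generic_file_write_iter(), then sync outside the lock. Spelled out in full (the same shape as generic_file_write_iter() above; "myfs" is a placeholder name, not a filesystem touched by this series):

static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	ssize_t ret;

	/* write under i_mutex... */
	mutex_lock(&inode->i_mutex);
	ret = __generic_file_write_iter(iocb, from);
	mutex_unlock(&inode->i_mutex);

	/* ...but sync (for O_SYNC/O_DSYNC) after dropping it */
	if (ret > 0) {
		ssize_t err;

		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
		if (err < 0)
			ret = err;
	}
	return ret;
}
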
diff --combined mm/page_io.c
index 58b50d2901fe2a43a916bae15daffb5c8c6f1cbe,33bb38c4aad716b326c299d63a6cb72b74759bbc..243a9b76e5cee9257d499311c21153084980c39a
@@@ -248,25 -248,28 +248,33 @@@ out
        return ret;
  }
  
 +static sector_t swap_page_sector(struct page *page)
 +{
 +      return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9);
 +}
 +
  int __swap_writepage(struct page *page, struct writeback_control *wbc,
        void (*end_write_func)(struct bio *, int))
  {
        struct bio *bio;
 -      int ret = 0, rw = WRITE;
 +      int ret, rw = WRITE;
        struct swap_info_struct *sis = page_swap_info(page);
  
        if (sis->flags & SWP_FILE) {
                struct kiocb kiocb;
                struct file *swap_file = sis->swap_file;
                struct address_space *mapping = swap_file->f_mapping;
-               struct iovec iov = {
-                       .iov_base = kmap(page),
-                       .iov_len  = PAGE_SIZE,
+               struct bio_vec bv = {
+                       .bv_page = page,
+                       .bv_len  = PAGE_SIZE,
+                       .bv_offset = 0
+               };
+               struct iov_iter from = {
+                       .type = ITER_BVEC | WRITE,
+                       .count = PAGE_SIZE,
+                       .iov_offset = 0,
+                       .nr_segs = 1,
+                       .bvec = &bv
                };
  
                init_sync_kiocb(&kiocb, swap_file);
  
                set_page_writeback(page);
                unlock_page(page);
-               ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
-                                               &kiocb, &iov,
-                                               kiocb.ki_pos, 1);
-               kunmap(page);
+               ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE,
+                                               &kiocb, &from,
+                                               kiocb.ki_pos);
                if (ret == PAGE_SIZE) {
                        count_vm_event(PSWPOUT);
                        ret = 0;
                return ret;
        }
  
 +      ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
 +      if (!ret) {
 +              count_vm_event(PSWPOUT);
 +              return 0;
 +      }
 +
 +      ret = 0;
        bio = get_swap_bio(GFP_NOIO, page, end_write_func);
        if (bio == NULL) {
                set_page_dirty(page);
@@@ -350,13 -345,6 +357,13 @@@ int swap_readpage(struct page *page
                return ret;
        }
  
 +      ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
 +      if (!ret) {
 +              count_vm_event(PSWPIN);
 +              return 0;
 +      }
 +
 +      ret = 0;
        bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
        if (bio == NULL) {
                unlock_page(page);
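
Note that __swap_writepage() above open-codes the single-page ITER_BVEC iterator because the series adds no bvec init helper yet. For illustration only, the same construction on the read side would look as below; swap_readpage() itself keeps using bdev_read_page()/bios, and swap_page_direct_read() is a hypothetical name, not part of the patch:

static int swap_page_direct_read(struct swap_info_struct *sis,
				 struct page *page)
{
	struct file *swap_file = sis->swap_file;
	struct address_space *mapping = swap_file->f_mapping;
	struct kiocb kiocb;
	struct bio_vec bv = {
		.bv_page = page,
		.bv_len  = PAGE_SIZE,
		.bv_offset = 0
	};
	struct iov_iter to = {
		.type = ITER_BVEC | READ,
		.count = PAGE_SIZE,
		.iov_offset = 0,
		.nr_segs = 1,
		.bvec = &bv
	};
	int ret;

	init_sync_kiocb(&kiocb, swap_file);
	kiocb.ki_pos = page_file_offset(page);
	/* rw argument mirrors the ITER_BVEC | WRITE call above */
	ret = mapping->a_ops->direct_IO(ITER_BVEC | READ, &kiocb, &to,
					kiocb.ki_pos);
	/* success is a full-page transfer, as in the write path */
	return ret == PAGE_SIZE ? 0 : ret;
}
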
diff --combined mm/shmem.c
index 5402481c28d190a83718f6b1897eec78df4c0209,de834ab8b6b90a0a406da4c96278ef946702e7ad..f484c276e994923a5c05577b42d5a9dcc58ae7cc
@@@ -1132,7 -1132,7 +1132,7 @@@ repeat
                        goto decused;
                }
  
 -              SetPageSwapBacked(page);
 +              __SetPageSwapBacked(page);
                __set_page_locked(page);
                error = mem_cgroup_charge_file(page, current->mm,
                                                gfp & GFP_RECLAIM_MASK);
@@@ -1372,13 -1372,9 +1372,13 @@@ shmem_write_begin(struct file *file, st
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
  {
 +      int ret;
        struct inode *inode = mapping->host;
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 -      return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
 +      ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
 +      if (ret == 0 && *pagep)
 +              init_page_accessed(*pagep);
 +      return ret;
  }
  
  static int
@@@ -1406,8 -1402,7 +1406,7 @@@ shmem_write_end(struct file *file, stru
        return copied;
  }
  
- static ssize_t shmem_file_aio_read(struct kiocb *iocb,
-               const struct iovec *iov, unsigned long nr_segs, loff_t pos)
+ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        unsigned long offset;
        enum sgp_type sgp = SGP_READ;
        int error = 0;
-       ssize_t retval;
-       size_t count;
+       ssize_t retval = 0;
        loff_t *ppos = &iocb->ki_pos;
-       struct iov_iter iter;
-       retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
-       if (retval)
-               return retval;
-       iov_iter_init(&iter, iov, nr_segs, count, 0);
  
        /*
         * Might this read be for a stacking filesystem?  Then when reading
                 * Ok, we have the page, and it's up-to-date, so
                 * now we can copy it to user space...
                 */
-               ret = copy_page_to_iter(page, offset, nr, &iter);
+               ret = copy_page_to_iter(page, offset, nr, to);
                retval += ret;
                offset += ret;
                index += offset >> PAGE_CACHE_SHIFT;
                offset &= ~PAGE_CACHE_MASK;
  
                page_cache_release(page);
-               if (!iov_iter_count(&iter))
+               if (!iov_iter_count(to))
                        break;
                if (ret < nr) {
                        error = -EFAULT;
@@@ -2629,13 -2617,13 +2621,13 @@@ static const struct file_operations shm
        .mmap           = shmem_mmap,
  #ifdef CONFIG_TMPFS
        .llseek         = shmem_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = shmem_file_aio_read,
-       .aio_write      = generic_file_aio_write,
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+       .read_iter      = shmem_file_read_iter,
+       .write_iter     = generic_file_write_iter,
        .fsync          = noop_fsync,
        .splice_read    = shmem_file_splice_read,
-       .splice_write   = generic_file_splice_write,
+       .splice_write   = iter_file_splice_write,
        .fallocate      = shmem_fallocate,
  #endif
  };
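
The ->read/->write slots above now point at the new_sync_read()/new_sync_write() glue from this series, which wraps the user buffer in a one-segment iov_iter and dispatches to ->read_iter/->write_iter. Roughly, on the read side (a simplified sketch of the fs/read_write.c helper, not verbatim):

static ssize_t new_sync_read_sketch(struct file *filp, char __user *buf,
				    size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	iov_iter_init(&iter, READ, &iov, 1, len);

	/* for shmem this reaches shmem_file_read_iter() */
	ret = filp->f_op->read_iter(&kiocb, &iter);
	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
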
diff --combined mm/vmscan.c
index e01ded365440704dbec95f0ec8f56326d646b9c2,9c2dba6ac68541397b5f3c9658c1a898284833cf..0f16ffe8eb67c6fcd0350add4a5a4b6092cb6905
@@@ -11,8 -11,6 +11,8 @@@
   *  Multiqueue VM started 5.8.00, Rik van Riel.
   */
  
 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 +
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/gfp.h>
@@@ -45,7 -43,6 +45,7 @@@
  #include <linux/sysctl.h>
  #include <linux/oom.h>
  #include <linux/prefetch.h>
 +#include <linux/printk.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
@@@ -86,9 -83,6 +86,9 @@@ struct scan_control 
        /* Scan (total_size >> priority) pages at once */
        int priority;
  
 +      /* anon vs. file LRUs scanning "ratio" */
 +      int swappiness;
 +
        /*
         * The memory cgroup that hit its limit and as a result is the
         * primary target of this reclaim invocation.
@@@ -330,7 -324,7 +330,7 @@@ shrink_slab_node(struct shrink_control 
        else
                new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
  
 -      trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
 +      trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
        return freed;
  }
  
@@@ -464,7 -458,7 +464,7 @@@ static pageout_t pageout(struct page *p
         * stalls if we need to run get_block().  We could test
         * PagePrivate for that.
         *
-        * If this process is currently in __generic_file_aio_write() against
+        * If this process is currently in __generic_file_write_iter() against
         * this page's queue, we can perform writeback even if that
         * will block.
         *
                if (page_has_private(page)) {
                        if (try_to_free_buffers(page)) {
                                ClearPageDirty(page);
 -                              printk("%s: orphaned page\n", __func__);
 +                              pr_info("%s: orphaned page\n", __func__);
                                return PAGE_CLEAN;
                        }
                }
@@@ -1127,7 -1121,7 +1127,7 @@@ keep
                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }
  
 -      free_hot_cold_page_list(&free_pages, 1);
 +      free_hot_cold_page_list(&free_pages, true);
  
        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);
@@@ -1444,19 -1438,6 +1444,19 @@@ putback_inactive_pages(struct lruvec *l
        list_splice(&pages_to_free, page_list);
  }
  
 +/*
 + * If a kernel thread (such as nfsd for loop-back mounts) services
 + * a backing device by writing to the page cache, it sets PF_LESS_THROTTLE.
 + * In that case we should only throttle if the backing device it is
 + * writing to is congested.  In other cases it is safe to throttle.
 + */
 +static int current_may_throttle(void)
 +{
 +      return !(current->flags & PF_LESS_THROTTLE) ||
 +              current->backing_dev_info == NULL ||
 +              bdi_write_congested(current->backing_dev_info);
 +}
 +
  /*
   * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
   * of reclaimed pages
@@@ -1538,7 -1519,7 +1538,7 @@@ shrink_inactive_list(unsigned long nr_t
  
        spin_unlock_irq(&zone->lru_lock);
  
 -      free_hot_cold_page_list(&page_list, 1);
 +      free_hot_cold_page_list(&page_list, true);
  
        /*
         * If reclaim is isolating dirty pages under writeback, it implies
                 * If dirty pages are scanned that are not queued for IO, it
                 * implies that flushers are not keeping up. In this case, flag
                 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
 -               * pages from reclaim context. It will forcibly stall in the
 -               * next check.
 +               * pages from reclaim context.
                 */
                if (nr_unqueued_dirty == nr_taken)
                        zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
  
                /*
 -               * In addition, if kswapd scans pages marked marked for
 -               * immediate reclaim and under writeback (nr_immediate), it
 -               * implies that pages are cycling through the LRU faster than
 +               * If kswapd scans pages marked for immediate
 +               * reclaim and under writeback (nr_immediate), it implies
 +               * that pages are cycling through the LRU faster than
                 * they are written so also forcibly stall.
                 */
 -              if (nr_unqueued_dirty == nr_taken || nr_immediate)
 +              if (nr_immediate && current_may_throttle())
                        congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
  
         * is congested. Allow kswapd to continue until it starts encountering
         * unqueued dirty pages or cycling through the LRU too quickly.
         */
 -      if (!sc->hibernation_mode && !current_is_kswapd())
 +      if (!sc->hibernation_mode && !current_is_kswapd() &&
 +          current_may_throttle())
                wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
  
        trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
@@@ -1759,7 -1740,7 +1759,7 @@@ static void shrink_active_list(unsigne
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
        spin_unlock_irq(&zone->lru_lock);
  
 -      free_hot_cold_page_list(&l_hold, 1);
 +      free_hot_cold_page_list(&l_hold, true);
  }
  
  #ifdef CONFIG_SWAP
@@@ -1849,6 -1830,13 +1849,6 @@@ static unsigned long shrink_list(enum l
        return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
  }
  
 -static int vmscan_swappiness(struct scan_control *sc)
 -{
 -      if (global_reclaim(sc))
 -              return vm_swappiness;
 -      return mem_cgroup_swappiness(sc->target_mem_cgroup);
 -}
 -
  enum scan_balance {
        SCAN_EQUAL,
        SCAN_FRACT,
@@@ -1878,8 -1866,6 +1878,8 @@@ static void get_scan_count(struct lruve
        bool force_scan = false;
        unsigned long ap, fp;
        enum lru_list lru;
 +      bool some_scanned;
 +      int pass;
  
        /*
         * If the zone or memcg is small, nr[l] can be 0.  This
         * using the memory controller's swap limit feature would be
         * too expensive.
         */
 -      if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
 +      if (!global_reclaim(sc) && !sc->swappiness) {
                scan_balance = SCAN_FILE;
                goto out;
        }
         * system is close to OOM, scan both anon and file equally
         * (unless the swappiness setting disagrees with swapping).
         */
 -      if (!sc->priority && vmscan_swappiness(sc)) {
 +      if (!sc->priority && sc->swappiness) {
                scan_balance = SCAN_EQUAL;
                goto out;
        }
         * With swappiness at 100, anonymous and file have the same priority.
         * This scanning priority is essentially the inverse of IO cost.
         */
 -      anon_prio = vmscan_swappiness(sc);
 +      anon_prio = sc->swappiness;
        file_prio = 200 - anon_prio;
  
        /*
        fraction[1] = fp;
        denominator = ap + fp + 1;
  out:
 -      for_each_evictable_lru(lru) {
 -              int file = is_file_lru(lru);
 -              unsigned long size;
 -              unsigned long scan;
 +      some_scanned = false;
 +      /* Only use force_scan on second pass. */
 +      for (pass = 0; !some_scanned && pass < 2; pass++) {
 +              for_each_evictable_lru(lru) {
 +                      int file = is_file_lru(lru);
 +                      unsigned long size;
 +                      unsigned long scan;
  
 -              size = get_lru_size(lruvec, lru);
 -              scan = size >> sc->priority;
 +                      size = get_lru_size(lruvec, lru);
 +                      scan = size >> sc->priority;
  
 -              if (!scan && force_scan)
 -                      scan = min(size, SWAP_CLUSTER_MAX);
 +                      if (!scan && pass && force_scan)
 +                              scan = min(size, SWAP_CLUSTER_MAX);
  
 -              switch (scan_balance) {
 -              case SCAN_EQUAL:
 -                      /* Scan lists relative to size */
 -                      break;
 -              case SCAN_FRACT:
 +                      switch (scan_balance) {
 +                      case SCAN_EQUAL:
 +                              /* Scan lists relative to size */
 +                              break;
 +                      case SCAN_FRACT:
 +                              /*
 +                               * Scan types proportional to swappiness and
 +                               * their relative recent reclaim efficiency.
 +                               */
 +                              scan = div64_u64(scan * fraction[file],
 +                                                      denominator);
 +                              break;
 +                      case SCAN_FILE:
 +                      case SCAN_ANON:
 +                              /* Scan one type exclusively */
 +                              if ((scan_balance == SCAN_FILE) != file)
 +                                      scan = 0;
 +                              break;
 +                      default:
 +                              /* Look ma, no brain */
 +                              BUG();
 +                      }
 +                      nr[lru] = scan;
                        /*
 -                       * Scan types proportional to swappiness and
 -                       * their relative recent reclaim efficiency.
 +                       * Skip the second pass and don't force_scan
 +                       * if we found something to scan.
                         */
 -                      scan = div64_u64(scan * fraction[file], denominator);
 -                      break;
 -              case SCAN_FILE:
 -              case SCAN_ANON:
 -                      /* Scan one type exclusively */
 -                      if ((scan_balance == SCAN_FILE) != file)
 -                              scan = 0;
 -                      break;
 -              default:
 -                      /* Look ma, no brain */
 -                      BUG();
 +                      some_scanned |= !!scan;
                }
 -              nr[lru] = scan;
        }
  }
  
@@@ -2061,27 -2037,13 +2061,27 @@@ static void shrink_lruvec(struct lruve
        unsigned long nr_reclaimed = 0;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
        struct blk_plug plug;
 -      bool scan_adjusted = false;
 +      bool scan_adjusted;
  
        get_scan_count(lruvec, sc, nr);
  
        /* Record the original scan target for proportional adjustments later */
        memcpy(targets, nr, sizeof(nr));
  
 +      /*
 +       * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
 +       * event that can occur when there is little memory pressure, e.g.
 +       * multiple streaming readers/writers. Hence, when scanning at
 +       * DEF_PRIORITY we do not abort once the requested number of pages
 +       * has been reclaimed, on the assumption that direct reclaiming
 +       * implies that kswapd is not keeping up and it is best to
 +       * do a batch of work at once. For memcg reclaim one check is made to
 +       * abort proportional reclaim if either the file or anon lru has already
 +       * dropped to zero at the first pass.
 +       */
 +      scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
 +                       sc->priority == DEF_PRIORITY);
 +
        blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
                if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
                        continue;
  
 -              /*
 -               * For global direct reclaim, reclaim only the number of pages
 -               * requested. Less care is taken to scan proportionally as it
 -               * is more important to minimise direct reclaim stall latency
 -               * than it is to properly age the LRU lists.
 -               */
 -              if (global_reclaim(sc) && !current_is_kswapd())
 -                      break;
 -
                /*
                 * For kswapd and memcg, reclaim at least the number of pages
 -               * requested. Ensure that the anon and file LRUs shrink
 +               * requested. Ensure that the anon and file LRUs are scanned
                 * proportionally to what was requested by get_scan_count(). We
                 * stop reclaiming one LRU and reduce the amount of scanning
                 * proportional to the original scan target.
                nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
                nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
  
 +              /*
 +               * It's just vindictive to attack the larger once the smaller
 +               * has gone to zero.  And given the way we stop scanning the
 +               * smaller below, this makes sure that we only make one nudge
 +               * towards proportionality once we've got nr_to_reclaim.
 +               */
 +              if (!nr_file || !nr_anon)
 +                      break;
 +
                if (nr_file > nr_anon) {
                        unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
                                                targets[LRU_ACTIVE_ANON] + 1;
@@@ -2262,7 -2224,6 +2262,7 @@@ static void shrink_zone(struct zone *zo
  
                        lruvec = mem_cgroup_zone_lruvec(zone, memcg);
  
 +                      sc->swappiness = mem_cgroup_swappiness(memcg);
                        shrink_lruvec(lruvec, sc);
  
                        /*
@@@ -2307,8 -2268,9 +2307,8 @@@ static inline bool compaction_ready(str
         * there is a buffer of free pages available to give compaction
         * a reasonable chance of completing and allocating the page
         */
 -      balance_gap = min(low_wmark_pages(zone),
 -              (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
 -                      KSWAPD_ZONE_BALANCE_GAP_RATIO);
 +      balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
 +                      zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
        watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
        watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
  
@@@ -2563,17 -2525,10 +2563,17 @@@ static bool pfmemalloc_watermark_ok(pg_
  
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
 +              if (!populated_zone(zone))
 +                      continue;
 +
                pfmemalloc_reserve += min_wmark_pages(zone);
                free_pages += zone_page_state(zone, NR_FREE_PAGES);
        }
  
 +      /* If there are no reserves (unexpected config) then do not throttle */
 +      if (!pfmemalloc_reserve)
 +              return true;
 +
        wmark_ok = free_pages > pfmemalloc_reserve / 2;
  
        /* kswapd must be awake if processes are being throttled */
  static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
                                        nodemask_t *nodemask)
  {
 +      struct zoneref *z;
        struct zone *zone;
 -      int high_zoneidx = gfp_zone(gfp_mask);
 -      pg_data_t *pgdat;
 +      pg_data_t *pgdat = NULL;
  
        /*
         * Kernel threads should not be throttled as they may be indirectly
        if (fatal_signal_pending(current))
                goto out;
  
 -      /* Check if the pfmemalloc reserves are ok */
 -      first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
 -      pgdat = zone->zone_pgdat;
 -      if (pfmemalloc_watermark_ok(pgdat))
 +      /*
 +       * Check if the pfmemalloc reserves are ok by finding the first node
 +       * with a usable ZONE_NORMAL or lower zone. The expectation is that
 +       * GFP_KERNEL will be required for allocating network buffers when
 +       * swapping over the network so ZONE_HIGHMEM is unusable.
 +       *
 +       * Throttling is based on the first usable node and throttled processes
 +       * wait on a queue until kswapd makes progress and wakes them. There
 +       * is an affinity then between processes waking up and where reclaim
 +       * progress has been made, assuming the process wakes on the same node.
 +       * More importantly, processes running on remote nodes will not compete
 +       * for remote pfmemalloc reserves and processes on different nodes
 +       * should make reasonable progress.
 +       */
 +      for_each_zone_zonelist_nodemask(zone, z, zonelist,
 +                                      gfp_mask, nodemask) {
 +              if (zone_idx(zone) > ZONE_NORMAL)
 +                      continue;
 +
 +              /* Throttle based on the first usable node */
 +              pgdat = zone->zone_pgdat;
 +              if (pfmemalloc_watermark_ok(pgdat))
 +                      goto out;
 +              break;
 +      }
 +
 +      /* If no zone was usable by the allocation flags then do not throttle */
 +      if (!pgdat)
                goto out;
  
        /* Account for the throttling */
@@@ -2729,7 -2660,6 +2729,7 @@@ unsigned long mem_cgroup_shrink_node_zo
                .may_swap = !noswap,
                .order = 0,
                .priority = 0,
 +              .swappiness = mem_cgroup_swappiness(memcg),
                .target_mem_cgroup = memcg,
        };
        struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
@@@ -2961,8 -2891,9 +2961,8 @@@ static bool kswapd_shrink_zone(struct z
         * high wmark plus a "gap" where the gap is either the low
         * watermark or 1% of the zone, whichever is smaller.
         */
 -      balance_gap = min(low_wmark_pages(zone),
 -              (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
 -              KSWAPD_ZONE_BALANCE_GAP_RATIO);
 +      balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
 +                      zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
  
        /*
         * If there is no low memory pressure or the zone is balanced then no
@@@ -3371,10 -3302,7 +3371,10 @@@ static int kswapd(void *p
                }
        }
  
 +      tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
        current->reclaim_state = NULL;
 +      lockdep_clear_current_reclaim_state();
 +
        return 0;
  }
  
@@@ -3494,7 -3422,7 +3494,7 @@@ int kswapd_run(int nid
  
  /*
   * Called by memory hotplug when all memory in a node is offlined.  Caller must
 - * hold lock_memory_hotplug().
 + * hold mem_hotplug_begin/end().
   */
  void kswapd_stop(int nid)
  {
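
One closing note on the two balance_gap hunks above: DIV_ROUND_UP() from <linux/kernel.h> is the same round-up idiom the old code spelled out by hand, so the change is purely cosmetic:

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* hence the new form */
balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
		zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
/* expands to the old one:
 * min(low_wmark_pages(zone),
 *     (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
 *      KSWAPD_ZONE_BALANCE_GAP_RATIO)
 */
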