Merge tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
fs/xfs/xfs_file.c
index 47fc632954228febc4e3a5bac611dbc121137348..ed95e5bb04e692b614983ea83bf870ac373a9b0c 100644
@@ -37,6 +37,7 @@
 #include "xfs_log.h"
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
@@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
 }
 
 /*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
+ * Clear the specified ranges to zero through either the pagecache or DAX.
+ * Holes and unwritten extents will be left as-is as they already are zeroed.
  */
 int
-xfs_iozero(
-       struct xfs_inode        *ip,    /* inode                        */
-       loff_t                  pos,    /* offset in file               */
-       size_t                  count)  /* size of data to zero         */
+xfs_zero_range(
+       struct xfs_inode        *ip,
+       xfs_off_t               pos,
+       xfs_off_t               count,
+       bool                    *did_zero)
 {
-       struct page             *page;
-       struct address_space    *mapping;
-       int                     status = 0;
-
-
-       mapping = VFS_I(ip)->i_mapping;
-       do {
-               unsigned offset, bytes;
-               void *fsdata;
-
-               offset = (pos & (PAGE_SIZE -1)); /* Within page */
-               bytes = PAGE_SIZE - offset;
-               if (bytes > count)
-                       bytes = count;
-
-               if (IS_DAX(VFS_I(ip))) {
-                       status = dax_zero_page_range(VFS_I(ip), pos, bytes,
-                                                    xfs_get_blocks_direct);
-                       if (status)
-                               break;
-               } else {
-                       status = pagecache_write_begin(NULL, mapping, pos, bytes,
-                                               AOP_FLAG_UNINTERRUPTIBLE,
-                                               &page, &fsdata);
-                       if (status)
-                               break;
-
-                       zero_user(page, offset, bytes);
-
-                       status = pagecache_write_end(NULL, mapping, pos, bytes,
-                                               bytes, page, fsdata);
-                       WARN_ON(status <= 0); /* can't return less than zero! */
-                       status = 0;
-               }
-               pos += bytes;
-               count -= bytes;
-       } while (count);
-
-       return status;
+       return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
 }
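
The replacement delegates the old open-coded loop to the generic iomap machinery. Below is a minimal sketch of the behaviour being delegated, in standalone C; lookup_extent() and zero_extent() are hypothetical stand-ins for ->iomap_begin and the zeroing I/O, not kernel APIs:

#include <stdbool.h>

enum ext_type { EXT_HOLE, EXT_UNWRITTEN, EXT_DATA };

struct extent {
	enum ext_type	type;
	long long	len;	/* bytes covered by this mapping */
};

/* hypothetical stand-ins for ->iomap_begin and the zeroing write */
struct extent lookup_extent(long long pos, long long count);
int zero_extent(long long pos, long long len);

static int zero_range_sketch(long long pos, long long count, bool *did_zero)
{
	while (count > 0) {
		struct extent ext = lookup_extent(pos, count);

		/* holes and unwritten extents already read back as zeroes */
		if (ext.type == EXT_DATA) {
			int error = zero_extent(pos, ext.len);
			if (error)
				return error;
			*did_zero = true;
		}
		pos += ext.len;
		count -= ext.len;
	}
	return 0;
}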
 
 int
@@ -282,48 +239,35 @@ xfs_file_fsync(
 }
 
 STATIC ssize_t
-xfs_file_read_iter(
+xfs_file_dio_aio_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
 {
-       struct file             *file = iocb->ki_filp;
-       struct inode            *inode = file->f_mapping->host;
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       size_t                  size = iov_iter_count(to);
+       loff_t                  isize = i_size_read(inode);
+       size_t                  count = iov_iter_count(to);
+       struct iov_iter         data;
+       struct xfs_buftarg      *target;
        ssize_t                 ret = 0;
-       int                     ioflags = 0;
-       xfs_fsize_t             n;
-       loff_t                  pos = iocb->ki_pos;
 
-       XFS_STATS_INC(mp, xs_read_calls);
-
-       if (unlikely(iocb->ki_flags & IOCB_DIRECT))
-               ioflags |= XFS_IO_ISDIRECT;
-       if (file->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-
-       if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
-               xfs_buftarg_t   *target =
-                       XFS_IS_REALTIME_INODE(ip) ?
-                               mp->m_rtdev_targp : mp->m_ddev_targp;
-               /* DIO must be aligned to device logical sector size */
-               if ((pos | size) & target->bt_logical_sectormask) {
-                       if (pos == i_size_read(inode))
-                               return 0;
-                       return -EINVAL;
-               }
-       }
+       trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
 
-       n = mp->m_super->s_maxbytes - pos;
-       if (n <= 0 || size == 0)
-               return 0;
+       if (!count)
+               return 0; /* skip atime */
 
-       if (n < size)
-               size = n;
+       if (XFS_IS_REALTIME_INODE(ip))
+               target = ip->i_mount->m_rtdev_targp;
+       else
+               target = ip->i_mount->m_ddev_targp;
 
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
+       /* DIO must be aligned to device logical sector size */
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+               if (iocb->ki_pos == isize)
+                       return 0;
+               return -EINVAL;
+       }
 
        /*
         * Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -336,7 +280,7 @@ xfs_file_read_iter(
         * serialisation.
         */
        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-       if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+       if (mapping->nrpages) {
                xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
@@ -351,8 +295,8 @@ xfs_file_read_iter(
                 * flush and reduce the chances of repeated iolock cycles going
                 * forward.
                 */
-               if (inode->i_mapping->nrpages) {
-                       ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+               if (mapping->nrpages) {
+                       ret = filemap_write_and_wait(mapping);
                        if (ret) {
                                xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
                                return ret;
@@ -363,20 +307,95 @@ xfs_file_read_iter(
                         * we fail to invalidate a page, but this should never
                         * happen on XFS. Warn if it does fail.
                         */
-                       ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+                       ret = invalidate_inode_pages2(mapping);
                        WARN_ON_ONCE(ret);
                        ret = 0;
                }
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
        }
 
-       trace_xfs_file_read(ip, size, pos, ioflags);
+       data = *to;
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, NULL, NULL, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(to, ret);
+       }
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
+       file_accessed(iocb->ki_filp);
+       return ret;
+}
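
The (iocb->ki_pos | count) & mask test above checks both the offset and the length for sector alignment in a single branch: any low bit set in either value survives the OR. A standalone demonstration with hypothetical values (512-byte sectors assumed):

#include <stdio.h>

int main(void)
{
	const unsigned long long sector_size = 512;
	const unsigned long long mask = sector_size - 1; /* bt_logical_sectormask analogue */

	const struct { unsigned long long pos, count; } io[] = {
		{   0, 4096 },	/* both aligned     -> allowed */
		{ 512, 1024 },	/* both aligned     -> allowed */
		{ 100, 4096 },	/* pos misaligned   -> -EINVAL */
		{ 512,  100 },	/* count misaligned -> -EINVAL */
	};

	for (int i = 0; i < 4; i++)
		printf("pos=%-4llu count=%-4llu -> %s\n",
		       io[i].pos, io[i].count,
		       ((io[i].pos | io[i].count) & mask) ? "-EINVAL" : "allowed");
	return 0;
}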
+
+static noinline ssize_t
+xfs_file_dax_read(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+{
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct iov_iter         data = *to;
+       size_t                  count = iov_iter_count(to);
+       ssize_t                 ret = 0;
+
+       trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+       if (!count)
+               return 0; /* skip atime */
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(to, ret);
+       }
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       file_accessed(iocb->ki_filp);
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_read(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+{
+       struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
+       ssize_t                 ret;
+
+       trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
        ret = generic_file_read_iter(iocb, to);
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+{
+       struct inode            *inode = file_inode(iocb->ki_filp);
+       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
+       ssize_t                 ret = 0;
+
+       XFS_STATS_INC(mp, xs_read_calls);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_read(iocb, to);
+       else if (iocb->ki_flags & IOCB_DIRECT)
+               ret = xfs_file_dio_aio_read(iocb, to);
+       else
+               ret = xfs_file_buffered_aio_read(iocb, to);
+
        if (ret > 0)
                XFS_STATS_ADD(mp, xs_read_bytes, ret);
-
-       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
 }
 
@@ -389,18 +408,14 @@ xfs_file_splice_read(
        unsigned int            flags)
 {
        struct xfs_inode        *ip = XFS_I(infilp->f_mapping->host);
-       int                     ioflags = 0;
        ssize_t                 ret;
 
        XFS_STATS_INC(ip->i_mount, xs_read_calls);
 
-       if (infilp->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+       trace_xfs_file_splice_read(ip, count, *ppos);
 
        /*
        * DAX inodes cannot use the page cache for splice, so we have to push
@@ -423,49 +438,6 @@ out:
        return ret;
 }
 
-/*
- * This routine is called to handle zeroing any space in the last block of the
- * file that is beyond the EOF.  We do this since the size is being increased
- * without writing anything to that block and we don't want to read the
- * garbage on the disk.
- */
-STATIC int                             /* error (positive) */
-xfs_zero_last_block(
-       struct xfs_inode        *ip,
-       xfs_fsize_t             offset,
-       xfs_fsize_t             isize,
-       bool                    *did_zeroing)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           last_fsb = XFS_B_TO_FSBT(mp, isize);
-       int                     zero_offset = XFS_B_FSB_OFFSET(mp, isize);
-       int                     zero_len;
-       int                     nimaps = 1;
-       int                     error = 0;
-       struct xfs_bmbt_irec    imap;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       if (error)
-               return error;
-
-       ASSERT(nimaps > 0);
-
-       /*
-        * If the block underlying isize is just a hole, then there
-        * is nothing to zero.
-        */
-       if (imap.br_startblock == HOLESTARTBLOCK)
-               return 0;
-
-       zero_len = mp->m_sb.sb_blocksize - zero_offset;
-       if (isize + zero_len > offset)
-               zero_len = offset - isize;
-       *did_zeroing = true;
-       return xfs_iozero(ip, isize, zero_len);
-}
-
 /*
  * Zero any on disk space between the current EOF and the new, larger EOF.
  *
@@ -484,94 +456,11 @@ xfs_zero_eof(
        xfs_fsize_t             isize,          /* current inode size */
        bool                    *did_zeroing)
 {
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           start_zero_fsb;
-       xfs_fileoff_t           end_zero_fsb;
-       xfs_fileoff_t           zero_count_fsb;
-       xfs_fileoff_t           last_fsb;
-       xfs_fileoff_t           zero_off;
-       xfs_fsize_t             zero_len;
-       int                     nimaps;
-       int                     error = 0;
-       struct xfs_bmbt_irec    imap;
-
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
        ASSERT(offset > isize);
 
        trace_xfs_zero_eof(ip, isize, offset - isize);
-
-       /*
-        * First handle zeroing the block on which isize resides.
-        *
-        * We only zero a part of that block so it is handled specially.
-        */
-       if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
-               error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
-               if (error)
-                       return error;
-       }
-
-       /*
-        * Calculate the range between the new size and the old where blocks
-        * needing to be zeroed may exist.
-        *
-        * To get the block where the last byte in the file currently resides,
-        * we need to subtract one from the size and truncate back to a block
-        * boundary.  We subtract 1 in case the size is exactly on a block
-        * boundary.
-        */
-       last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
-       start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-       end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
-       ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
-       if (last_fsb == end_zero_fsb) {
-               /*
-                * The size was only incremented on its last block.
-                * We took care of that above, so just return.
-                */
-               return 0;
-       }
-
-       ASSERT(start_zero_fsb <= end_zero_fsb);
-       while (start_zero_fsb <= end_zero_fsb) {
-               nimaps = 1;
-               zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
-                                         &imap, &nimaps, 0);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               if (error)
-                       return error;
-
-               ASSERT(nimaps > 0);
-
-               if (imap.br_state == XFS_EXT_UNWRITTEN ||
-                   imap.br_startblock == HOLESTARTBLOCK) {
-                       start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-                       ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-                       continue;
-               }
-
-               /*
-                * There are blocks we need to zero.
-                */
-               zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
-               zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
-               if ((zero_off + zero_len) > offset)
-                       zero_len = offset - zero_off;
-
-               error = xfs_iozero(ip, zero_off, zero_len);
-               if (error)
-                       return error;
-
-               *did_zeroing = true;
-               start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-               ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-       }
-
-       return 0;
+       return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
 }
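
A worked example with hypothetical sizes: extending a file from isize = 1000 to offset = 8192 bytes on a 4096-byte-block filesystem. The old code zeroed the 1000..4095 tail of the last block by hand and then looped block by block over 4096..8191; the iomap version submits the byte range in one call and lets iomap skip whatever is a hole or unwritten:

#include <stdio.h>

int main(void)
{
	long long isize  = 1000;	/* current on-disk EOF */
	long long offset = 8192;	/* new, larger EOF     */

	/* xfs_zero_eof() now issues exactly this single byte range */
	printf("xfs_zero_range(ip, %lld, %lld, did_zeroing)\n",
	       isize, offset - isize);
	return 0;
}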
 
 /*
@@ -722,8 +611,7 @@ xfs_file_dio_aio_write(
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
        /* DIO must be aligned to device logical sector size */
-       if (!IS_DAX(inode) &&
-           ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
                return -EINVAL;
 
        /* "unaligned" here means not aligned to a filesystem block */
@@ -762,7 +650,7 @@ xfs_file_dio_aio_write(
        end = iocb->ki_pos + count - 1;
 
        /*
-        * See xfs_file_read_iter() for why we do a full-file flush here.
+        * See xfs_file_dio_aio_read() for why we do a full-file flush here.
         */
        if (mapping->nrpages) {
                ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -789,10 +677,12 @@ xfs_file_dio_aio_write(
                iolock = XFS_IOLOCK_SHARED;
        }
 
-       trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+       trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
        data = *from;
-       ret = mapping->a_ops->direct_IO(iocb, &data);
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, xfs_end_io_direct_write,
+                       NULL, DIO_ASYNC_EXTEND);
 
        /* see generic_file_direct_write() for why this is necessary */
        if (mapping->nrpages) {
@@ -809,10 +699,70 @@ out:
        xfs_rw_iunlock(ip, iolock);
 
        /*
-        * No fallback to buffered IO on errors for XFS. DAX can result in
-        * partial writes, but direct IO will either complete fully or fail.
+        * No fallback to buffered IO on errors for XFS, direct IO will either
+        * complete fully or fail.
         */
-       ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+       ASSERT(ret < 0 || ret == count);
+       return ret;
+}
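
For reference, an annotation of the raw-block call above; the argument roles follow the 4.8-era prototype in fs/direct-io.c, and the flag description is a best-effort reading rather than authoritative documentation:

/*
 * __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
 *         xfs_get_blocks_direct,     - block-mapping callback
 *         xfs_end_io_direct_write,   - completion hook: unwritten-extent
 *                                      conversion and i_size updates
 *         NULL,                      - submit_io: use default bio submission
 *         DIO_ASYNC_EXTEND);         - permit AIO writes that extend i_size,
 *                                      since the end_io hook performs the
 *                                      size update itself
 */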
+
+static noinline ssize_t
+xfs_file_dax_write(
+       struct kiocb            *iocb,
+       struct iov_iter         *from)
+{
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       ssize_t                 ret = 0;
+       int                     unaligned_io = 0;
+       int                     iolock;
+       struct iov_iter         data;
+
+       /* "unaligned" here means not aligned to a filesystem block */
+       if ((iocb->ki_pos & mp->m_blockmask) ||
+           ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+               unaligned_io = 1;
+               iolock = XFS_IOLOCK_EXCL;
+       } else if (mapping->nrpages) {
+               iolock = XFS_IOLOCK_EXCL;
+       } else {
+               iolock = XFS_IOLOCK_SHARED;
+       }
+       xfs_rw_ilock(ip, iolock);
+
+       ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+       if (ret)
+               goto out;
+
+       /*
+        * Yes, even DAX files can have page cache attached to them:  A zeroed
+        * page is inserted into the pagecache when we have to serve a write
+        * fault on a hole.  It should never be dirtied and can simply be
+        * dropped from the pagecache once we get real data for the page.
+        */
+       if (mapping->nrpages) {
+               ret = invalidate_inode_pages2(mapping);
+               WARN_ON_ONCE(ret);
+       }
+
+       if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+               iolock = XFS_IOLOCK_SHARED;
+       }
+
+       trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+       data = *from;
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+                       xfs_end_io_direct_write, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(from, ret);
+       }
+out:
+       xfs_rw_iunlock(ip, iolock);
        return ret;
 }
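
Both the DIO and the DAX write paths above share the same lock-demotion shape. A compressed sketch with stub helpers standing in for xfs_rw_ilock()/xfs_rw_ilock_demote(), not the kernel locking API:

/* stub helpers; assume rwsem-style shared/exclusive semantics */
void lock_shared(void);
void lock_excl(void);
void demote_excl_to_shared(void);

static void write_lock_pattern(int unaligned_io, int has_pagecache)
{
	/* unaligned, or racing with cached pages: start exclusive */
	int excl = unaligned_io || has_pagecache;

	if (excl)
		lock_excl();
	else
		lock_shared();

	/* ... write checks and page cache invalidation run here ... */

	/*
	 * Once the exclusive-only work is done, an aligned write can
	 * drop back to shared so other aligned writers may proceed
	 * concurrently.
	 */
	if (excl && !unaligned_io)
		demote_excl_to_shared();

	/* ... issue the I/O, then unlock at whatever level is held ... */
}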
 
@@ -839,9 +789,8 @@ xfs_file_buffered_aio_write(
        current->backing_dev_info = inode_to_bdi(inode);
 
 write_retry:
-       trace_xfs_file_buffered_write(ip, iov_iter_count(from),
-                                     iocb->ki_pos, 0);
-       ret = generic_perform_write(file, from, iocb->ki_pos);
+       trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+       ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
        if (likely(ret >= 0))
                iocb->ki_pos += ret;
 
@@ -895,7 +844,9 @@ xfs_file_write_iter(
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_write(iocb, from);
+       else if (iocb->ki_flags & IOCB_DIRECT)
                ret = xfs_file_dio_aio_write(iocb, from);
        else
                ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1551,9 +1502,9 @@ xfs_filemap_page_mkwrite(
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (IS_DAX(inode)) {
-               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
+               ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
        } else {
-               ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+               ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
                ret = block_page_mkwrite_return(ret);
        }
 
@@ -1585,7 +1536,7 @@ xfs_filemap_fault(
                 * changes to xfs_get_blocks_direct() to map unwritten extent
                 * ioend for conversion on read-only mappings.
                 */
-               ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
+               ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
        } else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1622,7 +1573,7 @@ xfs_filemap_pmd_fault(
        }
 
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
+       ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (flags & FAULT_FLAG_WRITE)