Merge tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
[sfrench/cifs-2.6.git] fs/xfs/xfs_aops.c
index 4c463b99fe574341043cb1f6ab612a59df881d31..7575cfc3ad15671b7b6242a241cca2c54b7d111d 100644
@@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
  * We're now finished for good with this page.  Update the page state via the
  * associated buffer_heads, paying attention to the start and end offsets that
  * we need to process on the page.
+ *
+ * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+ * buffer in the IO. Once it does this, it is no longer safe to access the
+ * bufferhead or the page: we may be racing with memory reclaim, which will see
+ * the page as clean and unused and can free both the bufferhead chain and the
+ * page.
  */
 static void
 xfs_finish_page_writeback(
@@ -95,8 +101,9 @@ xfs_finish_page_writeback(
        int                     error)
 {
        unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
-       struct buffer_head      *head, *bh;
+       struct buffer_head      *head, *bh, *next;
        unsigned int            off = 0;
+       unsigned int            bsize;
 
        ASSERT(bvec->bv_offset < PAGE_SIZE);
        ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@@ -105,15 +112,17 @@ xfs_finish_page_writeback(
 
        bh = head = page_buffers(bvec->bv_page);
 
+       bsize = bh->b_size;
        do {
+               next = bh->b_this_page;
                if (off < bvec->bv_offset)
                        goto next_bh;
                if (off > end)
                        break;
                bh->b_end_io(bh, !error);
 next_bh:
-               off += bh->b_size;
-       } while ((bh = bh->b_this_page) != head);
+               off += bsize;
+       } while ((bh = next) != head);
 }
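
The rewritten loop above reads bh->b_this_page (and the block size) before invoking bh->b_end_io(), because the callback on the last buffer can end page writeback and let memory reclaim free both the buffer_head chain and the page. A minimal userspace illustration of that pattern follows; it is not XFS code, just the "cache the link before calling a callback that may free the node" idiom, using only the C standard library:

#include <stdlib.h>

struct node {
	struct node	*next;				/* circular list, like buffer_heads on a page */
	void		(*end_io)(struct node *);	/* may free the node it is handed */
};

static void node_end_io(struct node *n)
{
	free(n);					/* after this, n must not be dereferenced */
}

static void finish_all(struct node *head)
{
	struct node *n = head;
	struct node *next;

	do {
		next = n->next;				/* grab the forward link first ... */
		n->end_io(n);				/* ... because this call may free n */
	} while ((n = next) != head);			/* only pointer values are compared here */
}

int main(void)
{
	struct node *a = malloc(sizeof(*a));
	struct node *b = malloc(sizeof(*b));

	if (!a || !b)
		return 1;
	a->next = b; a->end_io = node_end_io;
	b->next = a; b->end_io = node_end_io;
	finish_all(a);
	return 0;
}
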
 
 /*
@@ -438,7 +447,8 @@ xfs_submit_ioend(
 
        ioend->io_bio->bi_private = ioend;
        ioend->io_bio->bi_end_io = xfs_end_bio;
-
+       bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
+                        (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
        /*
         * If we are failing the IO now, just mark the ioend with an
         * error and finish it. This will run IO completion immediately
@@ -451,8 +461,7 @@ xfs_submit_ioend(
                return status;
        }
 
-       submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
-                  ioend->io_bio);
+       submit_bio(ioend->io_bio);
        return 0;
 }
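
With the 4.8 block layer the operation and its flags travel in the bio itself rather than as a submit_bio() argument, which is why the hunk above tags the bio with bio_set_op_attrs() and then calls the single-argument submit_bio(). A condensed sketch of that two-step sequence, assuming an already-built struct bio (the helper name is made up for illustration):

#include <linux/bio.h>

/* Tag the bio as a write (optionally a sync write, mirroring the
 * wbc->sync_mode == WB_SYNC_ALL check above), then submit it. */
static void example_submit_write(struct bio *bio, bool sync)
{
	bio_set_op_attrs(bio, REQ_OP_WRITE, sync ? WRITE_SYNC : 0);
	submit_bio(bio);
}
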
 
@@ -510,8 +519,9 @@ xfs_chain_bio(
 
        bio_chain(ioend->io_bio, new);
        bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
-       submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
-                  ioend->io_bio);
+       bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
+                         (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+       submit_bio(ioend->io_bio);
        ioend->io_bio = new;
 }
 
@@ -1040,6 +1050,20 @@ xfs_vm_releasepage(
 
        trace_xfs_releasepage(page->mapping->host, page, 0, 0);
 
+       /*
+        * mm accommodates an old ext3 case where clean pages might not have had
+        * the dirty bit cleared. Thus, it can send actual dirty pages to
+        * ->releasepage() via shrink_active_list(). Conversely,
+        * block_invalidatepage() can send pages that are still marked dirty
+        * but otherwise have invalidated buffers.
+        *
+        * We've historically freed buffers on the latter. Instead, quietly
+        * filter out all dirty pages to avoid spurious buffer state warnings.
+        * This can likely be removed once shrink_active_list() is fixed.
+        */
+       if (PageDirty(page))
+               return 0;
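
The shrink_active_list() path mentioned above reaches the filesystem through try_to_release_page(), which simply forwards to ->releasepage() once writeback is not in progress, so a page that is still marked dirty really can land here and the new check quietly declines to release it. Roughly (paraphrased from the mm side, not a verbatim copy):

int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
	struct address_space *mapping = page->mapping;

	if (PageWriteback(page))
		return 0;
	if (mapping && mapping->a_ops->releasepage)
		return mapping->a_ops->releasepage(page, gfp_mask);
	return try_to_free_buffers(page);
}
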
+
        xfs_count_page_state(page, &delalloc, &unwritten);
 
        if (WARN_ON_ONCE(delalloc))
@@ -1143,6 +1167,8 @@ __xfs_get_blocks(
        ssize_t                 size;
        int                     new = 0;
 
+       BUG_ON(create && !direct);
+
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
@@ -1150,22 +1176,14 @@ __xfs_get_blocks(
        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
        size = bh_result->b_size;
 
-       if (!create && direct && offset >= i_size_read(inode))
+       if (!create && offset >= i_size_read(inode))
                return 0;
 
        /*
         * Direct I/O is usually done on preallocated files, so try getting
-        * a block mapping without an exclusive lock first.  For buffered
-        * writes we already have the exclusive iolock anyway, so avoiding
-        * a lock roundtrip here by taking the ilock exclusive from the
-        * beginning is a useful micro optimization.
+        * a block mapping without an exclusive lock first.
         */
-       if (create && !direct) {
-               lockmode = XFS_ILOCK_EXCL;
-               xfs_ilock(ip, lockmode);
-       } else {
-               lockmode = xfs_ilock_data_map_shared(ip);
-       }
+       lockmode = xfs_ilock_data_map_shared(ip);
 
        ASSERT(offset <= mp->m_super->s_maxbytes);
        if (offset + size > mp->m_super->s_maxbytes)
@@ -1184,37 +1202,19 @@ __xfs_get_blocks(
             (imap.br_startblock == HOLESTARTBLOCK ||
              imap.br_startblock == DELAYSTARTBLOCK) ||
             (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
-               if (direct || xfs_get_extsz_hint(ip)) {
-                       /*
-                        * xfs_iomap_write_direct() expects the shared lock. It
-                        * is unlocked on return.
-                        */
-                       if (lockmode == XFS_ILOCK_EXCL)
-                               xfs_ilock_demote(ip, lockmode);
-
-                       error = xfs_iomap_write_direct(ip, offset, size,
-                                                      &imap, nimaps);
-                       if (error)
-                               return error;
-                       new = 1;
+               /*
+                * xfs_iomap_write_direct() expects the shared lock. It
+                * is unlocked on return.
+                */
+               if (lockmode == XFS_ILOCK_EXCL)
+                       xfs_ilock_demote(ip, lockmode);
 
-               } else {
-                       /*
-                        * Delalloc reservations do not require a transaction,
-                        * we can go on without dropping the lock here. If we
-                        * are allocating a new delalloc block, make sure that
-                        * we set the new flag so that we mark the buffer new so
-                        * that we know that it is newly allocated if the write
-                        * fails.
-                        */
-                       if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
-                               new = 1;
-                       error = xfs_iomap_write_delay(ip, offset, size, &imap);
-                       if (error)
-                               goto out_unlock;
+               error = xfs_iomap_write_direct(ip, offset, size,
+                                              &imap, nimaps);
+               if (error)
+                       return error;
+               new = 1;
 
-                       xfs_iunlock(ip, lockmode);
-               }
                trace_xfs_get_blocks_alloc(ip, offset, size,
                                ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
                                                   : XFS_IO_DELALLOC, &imap);
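
The comment above encodes a lock handoff convention worth making explicit: xfs_iomap_write_direct() is entered with the ilock held shared and, per that comment, the lock is dropped by the time it returns, so the caller neither unlocks on the error path nor after success. A condensed sketch of just that calling pattern, reusing the variables from the hunk (ip, lockmode, offset, size, imap, nimaps):

	if (lockmode == XFS_ILOCK_EXCL)
		xfs_ilock_demote(ip, lockmode);		/* EXCL -> SHARED, still held */

	/* enters with the ilock shared, returns with it dropped */
	error = xfs_iomap_write_direct(ip, offset, size, &imap, nimaps);
	if (error)
		return error;				/* no xfs_iunlock() here */
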
@@ -1235,9 +1235,7 @@ __xfs_get_blocks(
        }
 
        /* trim mapping down to size requested */
-       if (direct || size > (1 << inode->i_blkbits))
-               xfs_map_trim_size(inode, iblock, bh_result,
-                                 &imap, offset, size);
+       xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
 
        /*
         * For unwritten extents do not report a disk address in the buffered
@@ -1250,7 +1248,7 @@ __xfs_get_blocks(
                if (ISUNWRITTEN(&imap))
                        set_buffer_unwritten(bh_result);
                /* direct IO needs special help */
-               if (create && direct) {
+               if (create) {
                        if (dax_fault)
                                ASSERT(!ISUNWRITTEN(&imap));
                        else
@@ -1279,14 +1277,7 @@ __xfs_get_blocks(
             (new || ISUNWRITTEN(&imap))))
                set_buffer_new(bh_result);
 
-       if (imap.br_startblock == DELAYSTARTBLOCK) {
-               BUG_ON(direct);
-               if (create) {
-                       set_buffer_uptodate(bh_result);
-                       set_buffer_mapped(bh_result);
-                       set_buffer_delay(bh_result);
-               }
-       }
+       BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
 
        return 0;
 
@@ -1336,7 +1327,7 @@ xfs_get_blocks_dax_fault(
  * whereas if we have flags set we will always be called in task context
  * (i.e. from a workqueue).
  */
-STATIC int
+int
 xfs_end_io_direct_write(
        struct kiocb            *iocb,
        loff_t                  offset,
@@ -1407,234 +1398,10 @@ xfs_vm_direct_IO(
        struct kiocb            *iocb,
        struct iov_iter         *iter)
 {
-       struct inode            *inode = iocb->ki_filp->f_mapping->host;
-       dio_iodone_t            *endio = NULL;
-       int                     flags = 0;
-       struct block_device     *bdev;
-
-       if (iov_iter_rw(iter) == WRITE) {
-               endio = xfs_end_io_direct_write;
-               flags = DIO_ASYNC_EXTEND;
-       }
-
-       if (IS_DAX(inode)) {
-               return dax_do_io(iocb, inode, iter,
-                                xfs_get_blocks_direct, endio, 0);
-       }
-
-       bdev = xfs_find_bdev_for_inode(inode);
-       return  __blockdev_direct_IO(iocb, inode, bdev, iter,
-                       xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
-       struct inode            *inode,
-       loff_t                  start,
-       loff_t                  end)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fileoff_t           start_fsb;
-       xfs_fileoff_t           end_fsb;
-       int                     error;
-
-       start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
-       end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
-       if (end_fsb <= start_fsb)
-               return;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-                                               end_fsb - start_fsb);
-       if (error) {
-               /* something screwed, just bail */
-               if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                       xfs_alert(ip->i_mount,
-               "xfs_vm_write_failed: unable to clean up ino %lld",
-                                       ip->i_ino);
-               }
-       }
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
-       struct inode            *inode,
-       struct page             *page,
-       loff_t                  pos,
-       unsigned                len)
-{
-       loff_t                  block_offset;
-       loff_t                  block_start;
-       loff_t                  block_end;
-       loff_t                  from = pos & (PAGE_SIZE - 1);
-       loff_t                  to = from + len;
-       struct buffer_head      *bh, *head;
-       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
-
        /*
-        * The request pos offset might be 32 or 64 bit, this is all fine
-        * on 64-bit platform.  However, for 64-bit pos request on 32-bit
-        * platform, the high 32-bit will be masked off if we evaluate the
-        * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
-        * 0xfffff000 as an unsigned long, hence the result is incorrect
-        * which could cause the following ASSERT failed in most cases.
-        * In order to avoid this, we can evaluate the block_offset of the
-        * start of the page by using shifts rather than masks the mismatch
-        * problem.
+        * We just need the method present so that open/fcntl allow direct I/O.
         */
-       block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
-
-       ASSERT(block_offset + from == pos);
-
-       head = page_buffers(page);
-       block_start = 0;
-       for (bh = head; bh != head || !block_start;
-            bh = bh->b_this_page, block_start = block_end,
-                                  block_offset += bh->b_size) {
-               block_end = block_start + bh->b_size;
-
-               /* skip buffers before the write */
-               if (block_end <= from)
-                       continue;
-
-               /* if the buffer is after the write, we're done */
-               if (block_start >= to)
-                       break;
-
-               /*
-                * Process delalloc and unwritten buffers beyond EOF. We can
-                * encounter unwritten buffers in the event that a file has
-                * post-EOF unwritten extents and an extending write happens to
-                * fail (e.g., an unaligned write that also involves a delalloc
-                * to the same page).
-                */
-               if (!buffer_delay(bh) && !buffer_unwritten(bh))
-                       continue;
-
-               if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
-                   block_offset < i_size_read(inode))
-                       continue;
-
-               if (buffer_delay(bh))
-                       xfs_vm_kill_delalloc_range(inode, block_offset,
-                                                  block_offset + bh->b_size);
-
-               /*
-                * This buffer does not contain data anymore. make sure anyone
-                * who finds it knows that for certain.
-                */
-               clear_buffer_delay(bh);
-               clear_buffer_uptodate(bh);
-               clear_buffer_mapped(bh);
-               clear_buffer_new(bh);
-               clear_buffer_dirty(bh);
-               clear_buffer_unwritten(bh);
-       }
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                flags,
-       struct page             **pagep,
-       void                    **fsdata)
-{
-       pgoff_t                 index = pos >> PAGE_SHIFT;
-       struct page             *page;
-       int                     status;
-       struct xfs_mount        *mp = XFS_I(mapping->host)->i_mount;
-
-       ASSERT(len <= PAGE_SIZE);
-
-       page = grab_cache_page_write_begin(mapping, index, flags);
-       if (!page)
-               return -ENOMEM;
-
-       status = __block_write_begin(page, pos, len, xfs_get_blocks);
-       if (xfs_mp_fail_writes(mp))
-               status = -EIO;
-       if (unlikely(status)) {
-               struct inode    *inode = mapping->host;
-               size_t          isize = i_size_read(inode);
-
-               xfs_vm_write_failed(inode, page, pos, len);
-               unlock_page(page);
-
-               /*
-                * If the write is beyond EOF, we only want to kill blocks
-                * allocated in this write, not blocks that were previously
-                * written successfully.
-                */
-               if (xfs_mp_fail_writes(mp))
-                       isize = 0;
-               if (pos + len > isize) {
-                       ssize_t start = max_t(ssize_t, pos, isize);
-
-                       truncate_pagecache_range(inode, start, pos + len);
-               }
-
-               put_page(page);
-               page = NULL;
-       }
-
-       *pagep = page;
-       return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                copied,
-       struct page             *page,
-       void                    *fsdata)
-{
-       int                     ret;
-
-       ASSERT(len <= PAGE_SIZE);
-
-       ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-       if (unlikely(ret < len)) {
-               struct inode    *inode = mapping->host;
-               size_t          isize = i_size_read(inode);
-               loff_t          to = pos + len;
-
-               if (to > isize) {
-                       /* only kill blocks in this write beyond EOF */
-                       if (pos > isize)
-                               isize = pos;
-                       xfs_vm_kill_delalloc_range(inode, isize, to);
-                       truncate_pagecache_range(inode, isize, to);
-               }
-       }
-       return ret;
+       return -EINVAL;
 }
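
The stub is kept only because the VFS refuses O_DIRECT opens when an inode's address space has no ->direct_IO method; the actual direct I/O is now issued straight from the read/write paths in xfs_file.c and never comes through here. Roughly, the open-time gate looks like this (paraphrased from do_dentry_open() in fs/open.c, not a verbatim copy):

	if (f->f_flags & O_DIRECT) {
		/* any ->direct_IO method satisfies the check, even one that
		 * only ever returns -EINVAL */
		if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
			return -EINVAL;
	}
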
 
 STATIC sector_t
@@ -1747,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
        .set_page_dirty         = xfs_vm_set_page_dirty,
        .releasepage            = xfs_vm_releasepage,
        .invalidatepage         = xfs_vm_invalidatepage,
-       .write_begin            = xfs_vm_write_begin,
-       .write_end              = xfs_vm_write_end,
        .bmap                   = xfs_vm_bmap,
        .direct_IO              = xfs_vm_direct_IO,
        .migratepage            = buffer_migrate_page,