Merge tag 'afs-next-20171113' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 16 Nov 2017 19:41:22 +0000 (11:41 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 16 Nov 2017 19:41:22 +0000 (11:41 -0800)
Pull AFS updates from David Howells:
 "kAFS filesystem driver overhaul.

  The major points of the overhaul are:

   (1) Preliminary groundwork is laid for supporting network-namespacing
       of kAFS. The remainder of the namespacing work requires some way
       to pass namespace information to submounts triggered by an
       automount. This requires something like the mount overhaul that's
       in progress.

   (2) sockaddr_rxrpc is used in preference to in_addr for holding
       addresses internally, and support is added for talking to the
       YFS VL server. With this, kAFS can do everything over IPv6 as
       well as IPv4 if it's talking to servers that support it.

   (3) Callback handling is overhauled to be generally passive rather
       than active. 'Callbacks' are promises by the server to tell us
       about data and metadata changes. Callbacks are now checked when
       we next touch an inode rather than by actively going and looking
       for changes, where possible.

   (4) File access permit caching is overhauled to store the caching
       information per-inode rather than per-directory (where it was
       shared over the subordinate files). Whilst older AFS servers
       only allow ACLs on directories (shared to the files in that
       directory), newer AFS servers break that restriction.

       To improve memory usage and to make it easier to do mass-key
       removal, permit combinations are cached and shared.

   (5) Cell database management is overhauled to allow lighter locks to
       be used and to make cell records autonomous state machines that
       look after getting their own DNS records and cleaning themselves
       up, in particular preventing races in acquiring and relinquishing
       the fscache token for the cell.

   (6) Volume caching is overhauled. The afs_vlocation record is removed
       to simplify things, and the superblock is now keyed on the cell
       and the numeric volume ID only. The volume record is tied to a
       superblock and normal superblock management is used to mediate
       the lifetime of the volume fscache token.

   (7) File server record caching is overhauled to make server records
       independent of cells and volumes. A server can be in multiple
       cells (in such a case, the administrator must make sure that the
       VL services for all cells correctly reflect the volumes shared
       between those cells).

       Server records are now indexed using the UUID of the server
       rather than the address since a server can have multiple
       addresses.

   (8) File server rotation is overhauled to handle VMOVED, VBUSY (and
       similar), VOFFLINE and VNOVOL indications and to handle rotation
       both of servers and addresses of those servers. The rotation will
       also wait and retry if the server says it is busy.

   (9) Data writeback is overhauled. Each inode no longer stores a list
       of modified sections tagged with the key that authorised them;
       instead, the modified region of a page is noted in page->private
       and the inode keeps a list of the keys that made modifications
       (a standalone sketch of the page->private encoding follows this
       quoted summary).

       This simplifies things and allows other keys to be used to
       actually write to the server if a key that made a modification
       becomes useless.

  (10) Writable mmap() is implemented. This allows a kernel to be built
       entirely on AFS.

  Note that pre-AFS-3.4 servers are no longer supported, though support
  can be added back if necessary (AFS-3.4 was released in 1998)"
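
A standalone illustration of the page->private encoding mentioned in
point (9): the fs/afs/write.c hunks below pack the dirty byte range of a
page into page->private, with the start offset in the low bits and the
end offset shifted up by AFS_PRIV_SHIFT. The sketch below mimics that
scheme in userspace so the merge logic in afs_write_begin() is easier to
follow; the shift/mask values and the pack_dirty() and merge_dirty()
helpers are illustrative assumptions, not the kernel's actual
definitions.

#include <assert.h>
#include <stdio.h>

/* Assumed split: half the word for "from", half for "to". */
#define PRIV_SHIFT      (sizeof(unsigned long) * 8 / 2)
#define PRIV_MAX        ((1UL << PRIV_SHIFT) - 1)

/* Pack the dirty byte range [from, to) of a page into one word. */
static unsigned long pack_dirty(unsigned long from, unsigned long to)
{
        assert(from <= to && to <= PRIV_MAX);
        return (to << PRIV_SHIFT) | from;
}

/* Merge a new write into the stored range, as afs_write_begin() does.
 * Returns 0 if the ranges are disjoint and the caller must flush the
 * page first (the flush_conflicting_write case in the diff below). */
static int merge_dirty(unsigned long *priv, unsigned long from,
                       unsigned long to)
{
        unsigned long f = *priv & PRIV_MAX;
        unsigned long t = *priv >> PRIV_SHIFT;

        if (f == t) {                   /* no dirty region recorded yet */
                *priv = pack_dirty(from, to);
                return 1;
        }
        if (to < f || from > t)         /* disjoint: flush required */
                return 0;
        if (from < f)
                f = from;
        if (to > t)
                t = to;
        *priv = pack_dirty(f, t);
        return 1;
}

int main(void)
{
        unsigned long priv = 0;

        merge_dirty(&priv, 100, 200);   /* first write dirties [100,200) */
        merge_dirty(&priv, 150, 300);   /* overlapping write extends it  */
        printf("dirty range: %lu-%lu\n",
               priv & PRIV_MAX, priv >> PRIV_SHIFT);
        return 0;
}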

* tag 'afs-next-20171113' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs: (35 commits)
  afs: Protect call->state changes against signals
  afs: Trace page dirty/clean
  afs: Implement shared-writeable mmap
  afs: Get rid of the afs_writeback record
  afs: Introduce a file-private data record
  afs: Use a dynamic port if 7001 is in use
  afs: Fix directory read/modify race
  afs: Trace the sending of pages
  afs: Trace the initiation and completion of client calls
  afs: Fix documentation on # vs % prefix in mount source specification
  afs: Fix total-length calculation for multiple-page send
  afs: Only progress call state at end of Tx phase from rxrpc callback
  afs: Make use of the YFS service upgrade to fully support IPv6
  afs: Overhaul volume and server record caching and fileserver rotation
  afs: Move server rotation code into its own file
  afs: Add an address list concept
  afs: Overhaul cell database management
  afs: Overhaul permit caching
  afs: Overhaul the callback handling
  afs: Rename struct afs_call server member to cm_server
  ...

drivers/media/platform/qcom/venus/hfi.c
fs/afs/write.c
fs/btrfs/extent-tree.c
mm/filemap.c

diff --combined drivers/media/platform/qcom/venus/hfi.c
index ba29fd4d49847cc3d1956eefd224823d42cdb14e,e374c7d1a618fc9673e5f1ddc63c8e27cff4d468..1baf78d3c02d09e4085ffe54623b2b79db5657a2
@@@ -88,12 -88,6 +88,6 @@@ unlock
        return ret;
  }
  
- static int core_deinit_wait_atomic_t(atomic_t *p)
- {
-       schedule();
-       return 0;
- }
  int hfi_core_deinit(struct venus_core *core, bool blocking)
  {
        int ret = 0, empty;
  
        if (!empty) {
                mutex_unlock(&core->lock);
-               wait_on_atomic_t(&core->insts_count, core_deinit_wait_atomic_t,
+               wait_on_atomic_t(&core->insts_count, atomic_t_wait,
                                 TASK_UNINTERRUPTIBLE);
                mutex_lock(&core->lock);
        }
@@@ -484,7 -478,6 +478,7 @@@ int hfi_session_process_buf(struct venu
  
        return -EINVAL;
  }
 +EXPORT_SYMBOL_GPL(hfi_session_process_buf);
  
  irqreturn_t hfi_isr_thread(int irq, void *dev_id)
  {
diff --combined fs/afs/write.c
index 11dd0526b96b69eb0276bfbfc823a83a9e8b47b9,4472882f06df92d91831f60086fe32606862970a..18e46e31523ccc0295a35c058baa88d0ebf960ed
@@@ -8,6 -8,7 +8,7 @@@
   * as published by the Free Software Foundation; either version
   * 2 of the License, or (at your option) any later version.
   */
  #include <linux/backing-dev.h>
  #include <linux/slab.h>
  #include <linux/fs.h>
@@@ -16,9 -17,6 +17,6 @@@
  #include <linux/pagevec.h>
  #include "internal.h"
  
- static int afs_write_back_from_locked_page(struct afs_writeback *wb,
-                                          struct page *page);
  /*
   * mark a page as having been made dirty and thus needing writeback
   */
@@@ -28,58 -26,6 +26,6 @@@ int afs_set_page_dirty(struct page *pag
        return __set_page_dirty_nobuffers(page);
  }
  
- /*
-  * unlink a writeback record because its usage has reached zero
-  * - must be called with the wb->vnode->writeback_lock held
-  */
- static void afs_unlink_writeback(struct afs_writeback *wb)
- {
-       struct afs_writeback *front;
-       struct afs_vnode *vnode = wb->vnode;
-       list_del_init(&wb->link);
-       if (!list_empty(&vnode->writebacks)) {
-               /* if an fsync rises to the front of the queue then wake it
-                * up */
-               front = list_entry(vnode->writebacks.next,
-                                  struct afs_writeback, link);
-               if (front->state == AFS_WBACK_SYNCING) {
-                       _debug("wake up sync");
-                       front->state = AFS_WBACK_COMPLETE;
-                       wake_up(&front->waitq);
-               }
-       }
- }
- /*
-  * free a writeback record
-  */
- static void afs_free_writeback(struct afs_writeback *wb)
- {
-       _enter("");
-       key_put(wb->key);
-       kfree(wb);
- }
- /*
-  * dispose of a reference to a writeback record
-  */
- void afs_put_writeback(struct afs_writeback *wb)
- {
-       struct afs_vnode *vnode = wb->vnode;
-       _enter("{%d}", wb->usage);
-       spin_lock(&vnode->writeback_lock);
-       if (--wb->usage == 0)
-               afs_unlink_writeback(wb);
-       else
-               wb = NULL;
-       spin_unlock(&vnode->writeback_lock);
-       if (wb)
-               afs_free_writeback(wb);
- }
  /*
   * partly or wholly fill a page that's under preparation for writing
   */
@@@ -103,7 -49,7 +49,7 @@@ static int afs_fill_page(struct afs_vno
        req->pages[0] = page;
        get_page(page);
  
-       ret = afs_vnode_fetch_data(vnode, key, req);
+       ret = afs_fetch_data(vnode, key, req);
        afs_put_read(req);
        if (ret < 0) {
                if (ret == -ENOENT) {
@@@ -125,42 -71,32 +71,32 @@@ int afs_write_begin(struct file *file, 
                    loff_t pos, unsigned len, unsigned flags,
                    struct page **pagep, void **fsdata)
  {
-       struct afs_writeback *candidate, *wb;
        struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
        struct page *page;
-       struct key *key = file->private_data;
-       unsigned from = pos & (PAGE_SIZE - 1);
-       unsigned to = from + len;
+       struct key *key = afs_file_key(file);
+       unsigned long priv;
+       unsigned f, from = pos & (PAGE_SIZE - 1);
+       unsigned t, to = from + len;
        pgoff_t index = pos >> PAGE_SHIFT;
        int ret;
  
        _enter("{%x:%u},{%lx},%u,%u",
               vnode->fid.vid, vnode->fid.vnode, index, from, to);
  
-       candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
-       if (!candidate)
-               return -ENOMEM;
-       candidate->vnode = vnode;
-       candidate->first = candidate->last = index;
-       candidate->offset_first = from;
-       candidate->to_last = to;
-       INIT_LIST_HEAD(&candidate->link);
-       candidate->usage = 1;
-       candidate->state = AFS_WBACK_PENDING;
-       init_waitqueue_head(&candidate->waitq);
+       /* We want to store information about how much of a page is altered in
+        * page->private.
+        */
+       BUILD_BUG_ON(PAGE_SIZE > 32768 && sizeof(page->private) < 8);
  
        page = grab_cache_page_write_begin(mapping, index, flags);
-       if (!page) {
-               kfree(candidate);
+       if (!page)
                return -ENOMEM;
-       }
  
        if (!PageUptodate(page) && len != PAGE_SIZE) {
                ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page);
                if (ret < 0) {
                        unlock_page(page);
                        put_page(page);
-                       kfree(candidate);
                        _leave(" = %d [prep]", ret);
                        return ret;
                }
        *pagep = page;
  
  try_again:
-       spin_lock(&vnode->writeback_lock);
-       /* see if this page is already pending a writeback under a suitable key
-        * - if so we can just join onto that one */
-       wb = (struct afs_writeback *) page_private(page);
-       if (wb) {
-               if (wb->key == key && wb->state == AFS_WBACK_PENDING)
-                       goto subsume_in_current_wb;
-               goto flush_conflicting_wb;
+       /* See if this page is already partially written in a way that we can
+        * merge the new write with.
+        */
+       t = f = 0;
+       if (PagePrivate(page)) {
+               priv = page_private(page);
+               f = priv & AFS_PRIV_MAX;
+               t = priv >> AFS_PRIV_SHIFT;
+               ASSERTCMP(f, <=, t);
        }
  
-       if (index > 0) {
-               /* see if we can find an already pending writeback that we can
-                * append this page to */
-               list_for_each_entry(wb, &vnode->writebacks, link) {
-                       if (wb->last == index - 1 && wb->key == key &&
-                           wb->state == AFS_WBACK_PENDING)
-                               goto append_to_previous_wb;
-               }
+       if (f != t) {
+               if (to < f || from > t)
+                       goto flush_conflicting_write;
+               if (from < f)
+                       f = from;
+               if (to > t)
+                       t = to;
+       } else {
+               f = from;
+               t = to;
        }
  
-       list_add_tail(&candidate->link, &vnode->writebacks);
-       candidate->key = key_get(key);
-       spin_unlock(&vnode->writeback_lock);
-       SetPagePrivate(page);
-       set_page_private(page, (unsigned long) candidate);
-       _leave(" = 0 [new]");
-       return 0;
- subsume_in_current_wb:
-       _debug("subsume");
-       ASSERTRANGE(wb->first, <=, index, <=, wb->last);
-       if (index == wb->first && from < wb->offset_first)
-               wb->offset_first = from;
-       if (index == wb->last && to > wb->to_last)
-               wb->to_last = to;
-       spin_unlock(&vnode->writeback_lock);
-       kfree(candidate);
-       _leave(" = 0 [sub]");
-       return 0;
- append_to_previous_wb:
-       _debug("append into %lx-%lx", wb->first, wb->last);
-       wb->usage++;
-       wb->last++;
-       wb->to_last = to;
-       spin_unlock(&vnode->writeback_lock);
+       priv = (unsigned long)t << AFS_PRIV_SHIFT;
+       priv |= f;
+       trace_afs_page_dirty(vnode, tracepoint_string("begin"),
+                            page->index, priv);
        SetPagePrivate(page);
-       set_page_private(page, (unsigned long) wb);
-       kfree(candidate);
-       _leave(" = 0 [app]");
+       set_page_private(page, priv);
+       _leave(" = 0");
        return 0;
  
-       /* the page is currently bound to another context, so if it's dirty we
-        * need to flush it before we can use the new context */
- flush_conflicting_wb:
+       /* The previous write and this write aren't adjacent or overlapping, so
+        * flush the page out.
+        */
+ flush_conflicting_write:
        _debug("flush conflict");
-       if (wb->state == AFS_WBACK_PENDING)
-               wb->state = AFS_WBACK_CONFLICTING;
-       spin_unlock(&vnode->writeback_lock);
-       if (clear_page_dirty_for_io(page)) {
-               ret = afs_write_back_from_locked_page(wb, page);
-               if (ret < 0) {
-                       afs_put_writeback(candidate);
-                       _leave(" = %d", ret);
-                       return ret;
-               }
+       ret = write_one_page(page);
+       if (ret < 0) {
+               _leave(" = %d", ret);
+               return ret;
        }
  
-       /* the page holds a ref on the writeback record */
-       afs_put_writeback(wb);
-       set_page_private(page, 0);
-       ClearPagePrivate(page);
+       ret = lock_page_killable(page);
+       if (ret < 0) {
+               _leave(" = %d", ret);
+               return ret;
+       }
        goto try_again;
  }
  
@@@ -255,7 -166,7 +166,7 @@@ int afs_write_end(struct file *file, st
                  struct page *page, void *fsdata)
  {
        struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
-       struct key *key = file->private_data;
+       struct key *key = afs_file_key(file);
        loff_t i_size, maybe_i_size;
        int ret;
  
  
        i_size = i_size_read(&vnode->vfs_inode);
        if (maybe_i_size > i_size) {
-               spin_lock(&vnode->writeback_lock);
+               spin_lock(&vnode->wb_lock);
                i_size = i_size_read(&vnode->vfs_inode);
                if (maybe_i_size > i_size)
                        i_size_write(&vnode->vfs_inode, maybe_i_size);
-               spin_unlock(&vnode->writeback_lock);
+               spin_unlock(&vnode->wb_lock);
        }
  
        if (!PageUptodate(page)) {
  /*
   * kill all the pages in the given range
   */
- static void afs_kill_pages(struct afs_vnode *vnode, bool error,
+ static void afs_kill_pages(struct address_space *mapping,
                           pgoff_t first, pgoff_t last)
  {
+       struct afs_vnode *vnode = AFS_FS_I(mapping->host);
        struct pagevec pv;
        unsigned count, loop;
  
        _enter("{%x:%u},%lx-%lx",
               vnode->fid.vid, vnode->fid.vnode, first, last);
  
 -      pagevec_init(&pv, 0);
 +      pagevec_init(&pv);
  
        do {
                _debug("kill %lx-%lx", first, last);
                count = last - first + 1;
                if (count > PAGEVEC_SIZE)
                        count = PAGEVEC_SIZE;
-               pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
-                                             first, count, pv.pages);
+               pv.nr = find_get_pages_contig(mapping, first, count, pv.pages);
                ASSERTCMP(pv.nr, ==, count);
  
                for (loop = 0; loop < count; loop++) {
                        struct page *page = pv.pages[loop];
                        ClearPageUptodate(page);
-                       if (error)
-                               SetPageError(page);
-                       if (PageWriteback(page))
-                               end_page_writeback(page);
+                       SetPageError(page);
+                       end_page_writeback(page);
                        if (page->index >= first)
                                first = page->index + 1;
+                       lock_page(page);
+                       generic_error_remove_page(mapping, page);
                }
  
                __pagevec_release(&pv);
-       } while (first < last);
+       } while (first <= last);
  
        _leave("");
  }
  
  /*
-  * synchronously write back the locked page and any subsequent non-locked dirty
-  * pages also covered by the same writeback record
+  * Redirty all the pages in a given range.
+  */
+ static void afs_redirty_pages(struct writeback_control *wbc,
+                             struct address_space *mapping,
+                             pgoff_t first, pgoff_t last)
+ {
+       struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+       struct pagevec pv;
+       unsigned count, loop;
+       _enter("{%x:%u},%lx-%lx",
+              vnode->fid.vid, vnode->fid.vnode, first, last);
 -      pagevec_init(&pv, 0);
++      pagevec_init(&pv);
+       do {
+               _debug("redirty %lx-%lx", first, last);
+               count = last - first + 1;
+               if (count > PAGEVEC_SIZE)
+                       count = PAGEVEC_SIZE;
+               pv.nr = find_get_pages_contig(mapping, first, count, pv.pages);
+               ASSERTCMP(pv.nr, ==, count);
+               for (loop = 0; loop < count; loop++) {
+                       struct page *page = pv.pages[loop];
+                       redirty_page_for_writepage(wbc, page);
+                       end_page_writeback(page);
+                       if (page->index >= first)
+                               first = page->index + 1;
+               }
+               __pagevec_release(&pv);
+       } while (first <= last);
+       _leave("");
+ }
+ /*
+  * write to a file
+  */
+ static int afs_store_data(struct address_space *mapping,
+                         pgoff_t first, pgoff_t last,
+                         unsigned offset, unsigned to)
+ {
+       struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+       struct afs_fs_cursor fc;
+       struct afs_wb_key *wbk = NULL;
+       struct list_head *p;
+       int ret = -ENOKEY, ret2;
+       _enter("%s{%x:%u.%u},%lx,%lx,%x,%x",
+              vnode->volume->name,
+              vnode->fid.vid,
+              vnode->fid.vnode,
+              vnode->fid.unique,
+              first, last, offset, to);
+       spin_lock(&vnode->wb_lock);
+       p = vnode->wb_keys.next;
+       /* Iterate through the list looking for a valid key to use. */
+ try_next_key:
+       while (p != &vnode->wb_keys) {
+               wbk = list_entry(p, struct afs_wb_key, vnode_link);
+               _debug("wbk %u", key_serial(wbk->key));
+               ret2 = key_validate(wbk->key);
+               if (ret2 == 0)
+                       goto found_key;
+               if (ret == -ENOKEY)
+                       ret = ret2;
+               p = p->next;
+       }
+       spin_unlock(&vnode->wb_lock);
+       afs_put_wb_key(wbk);
+       _leave(" = %d [no keys]", ret);
+       return ret;
+ found_key:
+       refcount_inc(&wbk->usage);
+       spin_unlock(&vnode->wb_lock);
+       _debug("USE WB KEY %u", key_serial(wbk->key));
+       ret = -ERESTARTSYS;
+       if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) {
+               while (afs_select_fileserver(&fc)) {
+                       fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+                       afs_fs_store_data(&fc, mapping, first, last, offset, to);
+               }
+               afs_check_for_remote_deletion(&fc, fc.vnode);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               ret = afs_end_vnode_operation(&fc);
+       }
+       switch (ret) {
+       case -EACCES:
+       case -EPERM:
+       case -ENOKEY:
+       case -EKEYEXPIRED:
+       case -EKEYREJECTED:
+       case -EKEYREVOKED:
+               _debug("next");
+               spin_lock(&vnode->wb_lock);
+               p = wbk->vnode_link.next;
+               afs_put_wb_key(wbk);
+               goto try_next_key;
+       }
+       afs_put_wb_key(wbk);
+       _leave(" = %d", ret);
+       return ret;
+ }
+ /*
+  * Synchronously write back the locked page and any subsequent non-locked dirty
+  * pages.
   */
- static int afs_write_back_from_locked_page(struct afs_writeback *wb,
-                                          struct page *primary_page)
+ static int afs_write_back_from_locked_page(struct address_space *mapping,
+                                          struct writeback_control *wbc,
+                                          struct page *primary_page,
+                                          pgoff_t final_page)
  {
+       struct afs_vnode *vnode = AFS_FS_I(mapping->host);
        struct page *pages[8], *page;
-       unsigned long count;
-       unsigned n, offset, to;
+       unsigned long count, priv;
+       unsigned n, offset, to, f, t;
        pgoff_t start, first, last;
        int loop, ret;
  
        if (test_set_page_writeback(primary_page))
                BUG();
  
-       /* find all consecutive lockable dirty pages, stopping when we find a
-        * page that is not immediately lockable, is not dirty or is missing,
-        * or we reach the end of the range */
+       /* Find all consecutive lockable dirty pages that have contiguous
+        * written regions, stopping when we find a page that is not
+        * immediately lockable, is not dirty or is missing, or we reach the
+        * end of the range.
+        */
        start = primary_page->index;
-       if (start >= wb->last)
+       priv = page_private(primary_page);
+       offset = priv & AFS_PRIV_MAX;
+       to = priv >> AFS_PRIV_SHIFT;
+       trace_afs_page_dirty(vnode, tracepoint_string("store"),
+                            primary_page->index, priv);
+       WARN_ON(offset == to);
+       if (offset == to)
+               trace_afs_page_dirty(vnode, tracepoint_string("WARN"),
+                                    primary_page->index, priv);
+       if (start >= final_page || to < PAGE_SIZE)
                goto no_more;
        start++;
        do {
                _debug("more %lx [%lx]", start, count);
-               n = wb->last - start + 1;
+               n = final_page - start + 1;
                if (n > ARRAY_SIZE(pages))
                        n = ARRAY_SIZE(pages);
-               n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping,
-                                         start, n, pages);
+               n = find_get_pages_contig(mapping, start, ARRAY_SIZE(pages), pages);
                _debug("fgpc %u", n);
                if (n == 0)
                        goto no_more;
                }
  
                for (loop = 0; loop < n; loop++) {
+                       if (to != PAGE_SIZE)
+                               break;
                        page = pages[loop];
-                       if (page->index > wb->last)
+                       if (page->index > final_page)
                                break;
                        if (!trylock_page(page))
                                break;
-                       if (!PageDirty(page) ||
-                           page_private(page) != (unsigned long) wb) {
+                       if (!PageDirty(page) || PageWriteback(page)) {
                                unlock_page(page);
                                break;
                        }
+                       priv = page_private(page);
+                       f = priv & AFS_PRIV_MAX;
+                       t = priv >> AFS_PRIV_SHIFT;
+                       if (f != 0) {
+                               unlock_page(page);
+                               break;
+                       }
+                       to = t;
+                       trace_afs_page_dirty(vnode, tracepoint_string("store+"),
+                                            page->index, priv);
                        if (!clear_page_dirty_for_io(page))
                                BUG();
                        if (test_set_page_writeback(page))
                }
  
                start += loop;
-       } while (start <= wb->last && count < 65536);
+       } while (start <= final_page && count < 65536);
  
  no_more:
-       /* we now have a contiguous set of dirty pages, each with writeback set
-        * and the dirty mark cleared; the first page is locked and must remain
-        * so, all the rest are unlocked */
+       /* We now have a contiguous set of dirty pages, each with writeback
+        * set; the first page is still locked at this point, but all the rest
+        * have been unlocked.
+        */
+       unlock_page(primary_page);
        first = primary_page->index;
        last = first + count - 1;
  
-       offset = (first == wb->first) ? wb->offset_first : 0;
-       to = (last == wb->last) ? wb->to_last : PAGE_SIZE;
        _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);
  
-       ret = afs_vnode_store_data(wb, first, last, offset, to);
-       if (ret < 0) {
-               switch (ret) {
-               case -EDQUOT:
-               case -ENOSPC:
-                       mapping_set_error(wb->vnode->vfs_inode.i_mapping, -ENOSPC);
-                       break;
-               case -EROFS:
-               case -EIO:
-               case -EREMOTEIO:
-               case -EFBIG:
-               case -ENOENT:
-               case -ENOMEDIUM:
-               case -ENXIO:
-                       afs_kill_pages(wb->vnode, true, first, last);
-                       mapping_set_error(wb->vnode->vfs_inode.i_mapping, -EIO);
-                       break;
-               case -EACCES:
-               case -EPERM:
-               case -ENOKEY:
-               case -EKEYEXPIRED:
-               case -EKEYREJECTED:
-               case -EKEYREVOKED:
-                       afs_kill_pages(wb->vnode, false, first, last);
-                       break;
-               default:
-                       break;
-               }
-       } else {
+       ret = afs_store_data(mapping, first, last, offset, to);
+       switch (ret) {
+       case 0:
                ret = count;
+               break;
+       default:
+               pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret);
+               /* Fall through */
+       case -EACCES:
+       case -EPERM:
+       case -ENOKEY:
+       case -EKEYEXPIRED:
+       case -EKEYREJECTED:
+       case -EKEYREVOKED:
+               afs_redirty_pages(wbc, mapping, first, last);
+               mapping_set_error(mapping, ret);
+               break;
+       case -EDQUOT:
+       case -ENOSPC:
+               afs_redirty_pages(wbc, mapping, first, last);
+               mapping_set_error(mapping, -ENOSPC);
+               break;
+       case -EROFS:
+       case -EIO:
+       case -EREMOTEIO:
+       case -EFBIG:
+       case -ENOENT:
+       case -ENOMEDIUM:
+       case -ENXIO:
+               afs_kill_pages(mapping, first, last);
+               mapping_set_error(mapping, ret);
+               break;
        }
  
        _leave(" = %d", ret);
   */
  int afs_writepage(struct page *page, struct writeback_control *wbc)
  {
-       struct afs_writeback *wb;
        int ret;
  
        _enter("{%lx},", page->index);
  
-       wb = (struct afs_writeback *) page_private(page);
-       ASSERT(wb != NULL);
-       ret = afs_write_back_from_locked_page(wb, page);
-       unlock_page(page);
+       ret = afs_write_back_from_locked_page(page->mapping, wbc, page,
+                                             wbc->range_end >> PAGE_SHIFT);
        if (ret < 0) {
                _leave(" = %d", ret);
                return 0;
@@@ -490,26 -550,37 +550,30 @@@ static int afs_writepages_region(struc
                                 struct writeback_control *wbc,
                                 pgoff_t index, pgoff_t end, pgoff_t *_next)
  {
-       struct afs_writeback *wb;
        struct page *page;
        int ret, n;
  
        _enter(",,%lx,%lx,", index, end);
  
        do {
 -              n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
 -                                     1, &page);
 +              n = find_get_pages_range_tag(mapping, &index, end,
 +                                      PAGECACHE_TAG_DIRTY, 1, &page);
                if (!n)
                        break;
  
                _debug("wback %lx", page->index);
  
 -              if (page->index > end) {
 -                      *_next = index;
 -                      put_page(page);
 -                      _leave(" = 0 [%lx]", *_next);
 -                      return 0;
 -              }
 -
                /* at this point we hold neither mapping->tree_lock nor lock on
                 * the page itself: the page may be truncated or invalidated
                 * (changing page->mapping to NULL), or even swizzled back from
                 * swapper_space to tmpfs file mapping
                 */
-               lock_page(page);
+               ret = lock_page_killable(page);
+               if (ret < 0) {
+                       put_page(page);
+                       _leave(" = %d", ret);
+                       return ret;
+               }
  
                if (page->mapping != mapping || !PageDirty(page)) {
                        unlock_page(page);
                        continue;
                }
  
-               wb = (struct afs_writeback *) page_private(page);
-               ASSERT(wb != NULL);
-               spin_lock(&wb->vnode->writeback_lock);
-               wb->state = AFS_WBACK_WRITING;
-               spin_unlock(&wb->vnode->writeback_lock);
                if (!clear_page_dirty_for_io(page))
                        BUG();
-               ret = afs_write_back_from_locked_page(wb, page);
-               unlock_page(page);
+               ret = afs_write_back_from_locked_page(mapping, wbc, page, end);
                put_page(page);
                if (ret < 0) {
                        _leave(" = %d", ret);
@@@ -591,18 -654,15 +647,15 @@@ int afs_writepages(struct address_spac
   */
  void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
  {
-       struct afs_writeback *wb = call->wb;
        struct pagevec pv;
+       unsigned long priv;
        unsigned count, loop;
        pgoff_t first = call->first, last = call->last;
-       bool free_wb;
  
        _enter("{%x:%u},{%lx-%lx}",
               vnode->fid.vid, vnode->fid.vnode, first, last);
  
-       ASSERT(wb != NULL);
 -      pagevec_init(&pv, 0);
 +      pagevec_init(&pv);
  
        do {
                _debug("done %lx-%lx", first, last);
                count = last - first + 1;
                if (count > PAGEVEC_SIZE)
                        count = PAGEVEC_SIZE;
-               pv.nr = find_get_pages_contig(call->mapping, first, count,
-                                             pv.pages);
+               pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
+                                             first, count, pv.pages);
                ASSERTCMP(pv.nr, ==, count);
  
-               spin_lock(&vnode->writeback_lock);
                for (loop = 0; loop < count; loop++) {
-                       struct page *page = pv.pages[loop];
-                       end_page_writeback(page);
-                       if (page_private(page) == (unsigned long) wb) {
-                               set_page_private(page, 0);
-                               ClearPagePrivate(page);
-                               wb->usage--;
-                       }
-               }
-               free_wb = false;
-               if (wb->usage == 0) {
-                       afs_unlink_writeback(wb);
-                       free_wb = true;
+                       priv = page_private(pv.pages[loop]);
+                       trace_afs_page_dirty(vnode, tracepoint_string("clear"),
+                                            pv.pages[loop]->index, priv);
+                       set_page_private(pv.pages[loop], 0);
+                       end_page_writeback(pv.pages[loop]);
                }
-               spin_unlock(&vnode->writeback_lock);
                first += count;
-               if (free_wb) {
-                       afs_free_writeback(wb);
-                       wb = NULL;
-               }
                __pagevec_release(&pv);
        } while (first <= last);
  
+       afs_prune_wb_keys(vnode);
        _leave("");
  }
  
@@@ -669,28 -716,6 +709,6 @@@ ssize_t afs_file_write(struct kiocb *io
        return result;
  }
  
- /*
-  * flush the vnode to the fileserver
-  */
- int afs_writeback_all(struct afs_vnode *vnode)
- {
-       struct address_space *mapping = vnode->vfs_inode.i_mapping;
-       struct writeback_control wbc = {
-               .sync_mode      = WB_SYNC_ALL,
-               .nr_to_write    = LONG_MAX,
-               .range_cyclic   = 1,
-       };
-       int ret;
-       _enter("");
-       ret = mapping->a_ops->writepages(mapping, &wbc);
-       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-       _leave(" = %d", ret);
-       return ret;
- }
  /*
   * flush any dirty pages for this process, and check for write errors.
   * - the return status from this call provides a reliable indication of
  int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
  {
        struct inode *inode = file_inode(file);
-       struct afs_writeback *wb, *xwb;
        struct afs_vnode *vnode = AFS_FS_I(inode);
-       int ret;
  
        _enter("{%x:%u},{n=%pD},%d",
               vnode->fid.vid, vnode->fid.vnode, file,
               datasync);
  
-       ret = file_write_and_wait_range(file, start, end);
-       if (ret)
-               return ret;
-       inode_lock(inode);
-       /* use a writeback record as a marker in the queue - when this reaches
-        * the front of the queue, all the outstanding writes are either
-        * completed or rejected */
-       wb = kzalloc(sizeof(*wb), GFP_KERNEL);
-       if (!wb) {
-               ret = -ENOMEM;
-               goto out;
-       }
-       wb->vnode = vnode;
-       wb->first = 0;
-       wb->last = -1;
-       wb->offset_first = 0;
-       wb->to_last = PAGE_SIZE;
-       wb->usage = 1;
-       wb->state = AFS_WBACK_SYNCING;
-       init_waitqueue_head(&wb->waitq);
-       spin_lock(&vnode->writeback_lock);
-       list_for_each_entry(xwb, &vnode->writebacks, link) {
-               if (xwb->state == AFS_WBACK_PENDING)
-                       xwb->state = AFS_WBACK_CONFLICTING;
-       }
-       list_add_tail(&wb->link, &vnode->writebacks);
-       spin_unlock(&vnode->writeback_lock);
-       /* push all the outstanding writebacks to the server */
-       ret = afs_writeback_all(vnode);
-       if (ret < 0) {
-               afs_put_writeback(wb);
-               _leave(" = %d [wb]", ret);
-               goto out;
-       }
-       /* wait for the preceding writes to actually complete */
-       ret = wait_event_interruptible(wb->waitq,
-                                      wb->state == AFS_WBACK_COMPLETE ||
-                                      vnode->writebacks.next == &wb->link);
-       afs_put_writeback(wb);
-       _leave(" = %d", ret);
- out:
-       inode_unlock(inode);
-       return ret;
+       return file_write_and_wait_range(file, start, end);
  }
  
  /*
@@@ -774,19 -751,114 +744,114 @@@ int afs_flush(struct file *file, fl_own
   * notification that a previously read-only page is about to become writable
   * - if it returns an error, the caller will deliver a bus error signal
   */
- int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+ int afs_page_mkwrite(struct vm_fault *vmf)
  {
-       struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
+       struct file *file = vmf->vma->vm_file;
+       struct inode *inode = file_inode(file);
+       struct afs_vnode *vnode = AFS_FS_I(inode);
+       unsigned long priv;
  
        _enter("{{%x:%u}},{%lx}",
-              vnode->fid.vid, vnode->fid.vnode, page->index);
+              vnode->fid.vid, vnode->fid.vnode, vmf->page->index);
+       sb_start_pagefault(inode->i_sb);
  
-       /* wait for the page to be written to the cache before we allow it to
-        * be modified */
+       /* Wait for the page to be written to the cache before we allow it to
+        * be modified.  We then assume the entire page will need writing back.
+        */
  #ifdef CONFIG_AFS_FSCACHE
-       fscache_wait_on_page_write(vnode->cache, page);
+       fscache_wait_on_page_write(vnode->cache, vmf->page);
  #endif
  
-       _leave(" = 0");
-       return 0;
+       if (PageWriteback(vmf->page) &&
+           wait_on_page_bit_killable(vmf->page, PG_writeback) < 0)
+               return VM_FAULT_RETRY;
+       if (lock_page_killable(vmf->page) < 0)
+               return VM_FAULT_RETRY;
+       /* We mustn't change page->private until writeback is complete as that
+        * details the portion of the page we need to write back and we might
+        * need to redirty the page if there's a problem.
+        */
+       wait_on_page_writeback(vmf->page);
+       priv = (unsigned long)PAGE_SIZE << AFS_PRIV_SHIFT; /* To */
+       priv |= 0; /* From */
+       trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"),
+                            vmf->page->index, priv);
+       SetPagePrivate(vmf->page);
+       set_page_private(vmf->page, priv);
+       sb_end_pagefault(inode->i_sb);
+       return VM_FAULT_LOCKED;
+ }
+ /*
+  * Prune the keys cached for writeback.  The caller must hold vnode->wb_lock.
+  */
+ void afs_prune_wb_keys(struct afs_vnode *vnode)
+ {
+       LIST_HEAD(graveyard);
+       struct afs_wb_key *wbk, *tmp;
+       /* Discard unused keys */
+       spin_lock(&vnode->wb_lock);
+       if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) &&
+           !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) {
+               list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) {
+                       if (refcount_read(&wbk->usage) == 1)
+                               list_move(&wbk->vnode_link, &graveyard);
+               }
+       }
+       spin_unlock(&vnode->wb_lock);
+       while (!list_empty(&graveyard)) {
+               wbk = list_entry(graveyard.next, struct afs_wb_key, vnode_link);
+               list_del(&wbk->vnode_link);
+               afs_put_wb_key(wbk);
+       }
+ }
+ /*
+  * Clean up a page during invalidation.
+  */
+ int afs_launder_page(struct page *page)
+ {
+       struct address_space *mapping = page->mapping;
+       struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+       unsigned long priv;
+       unsigned int f, t;
+       int ret = 0;
+       _enter("{%lx}", page->index);
+       priv = page_private(page);
+       if (clear_page_dirty_for_io(page)) {
+               f = 0;
+               t = PAGE_SIZE;
+               if (PagePrivate(page)) {
+                       f = priv & AFS_PRIV_MAX;
+                       t = priv >> AFS_PRIV_SHIFT;
+               }
+               trace_afs_page_dirty(vnode, tracepoint_string("launder"),
+                                    page->index, priv);
+               ret = afs_store_data(mapping, page->index, page->index, t, f);
+       }
+       trace_afs_page_dirty(vnode, tracepoint_string("laundered"),
+                            page->index, priv);
+       set_page_private(page, 0);
+       ClearPagePrivate(page);
+ #ifdef CONFIG_AFS_FSCACHE
+       if (PageFsCache(page)) {
+               fscache_wait_on_page_write(vnode->cache, page);
+               fscache_uncache_page(vnode->cache, page);
+       }
+ #endif
+       return ret;
  }
diff --combined fs/btrfs/extent-tree.c
index 673ac4e01dd07ddb788805a9fad631a2656b5d7f,24cefde30e30b1ffe58284216a64c6f4ecec2096..7208ecef70889833ac2caa7d3d5d8b4b634a4ee0
@@@ -26,7 -26,6 +26,7 @@@
  #include <linux/slab.h>
  #include <linux/ratelimit.h>
  #include <linux/percpu_counter.h>
 +#include <linux/lockdep.h>
  #include "hash.h"
  #include "tree-log.h"
  #include "disk-io.h"
@@@ -39,7 -38,6 +39,7 @@@
  #include "math.h"
  #include "sysfs.h"
  #include "qgroup.h"
 +#include "ref-verify.h"
  
  #undef SCRAMBLE_DELAYED_REFS
  
@@@ -63,6 -61,9 +63,6 @@@ enum 
        CHUNK_ALLOC_FORCE = 2,
  };
  
 -static int update_block_group(struct btrfs_trans_handle *trans,
 -                            struct btrfs_fs_info *fs_info, u64 bytenr,
 -                            u64 num_bytes, int alloc);
  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                               struct btrfs_fs_info *fs_info,
                                struct btrfs_delayed_ref_node *node, u64 parent,
@@@ -90,8 -91,17 +90,8 @@@ static int find_next_key(struct btrfs_p
  static void dump_space_info(struct btrfs_fs_info *fs_info,
                            struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
 -static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
 -                                  u64 ram_bytes, u64 num_bytes, int delalloc);
 -static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
 -                                   u64 num_bytes, int delalloc);
  static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
                               u64 num_bytes);
 -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 -                                  struct btrfs_space_info *space_info,
 -                                  u64 orig_bytes,
 -                                  enum btrfs_reserve_flush_enum flush,
 -                                  bool system_chunk);
  static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
                                     struct btrfs_space_info *space_info,
                                     u64 num_bytes);
@@@ -642,7 -652,7 +642,7 @@@ static int cache_block_group(struct btr
        cache->cached = BTRFS_CACHE_FAST;
        spin_unlock(&cache->lock);
  
 -      if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
 +      if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
                mutex_lock(&caching_ctl->mutex);
                ret = load_free_space_cache(fs_info, cache);
  
@@@ -913,7 -923,7 +913,7 @@@ search_again
        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
        if (head) {
                if (!mutex_trylock(&head->mutex)) {
 -                      refcount_inc(&head->node.refs);
 +                      refcount_inc(&head->refs);
                        spin_unlock(&delayed_refs->lock);
  
                        btrfs_release_path(path);
                         */
                        mutex_lock(&head->mutex);
                        mutex_unlock(&head->mutex);
 -                      btrfs_put_delayed_ref(&head->node);
 +                      btrfs_put_delayed_ref_head(head);
                        goto search_again;
                }
                spin_lock(&head->lock);
                else
                        BUG_ON(num_refs == 0);
  
 -              num_refs += head->node.ref_mod;
 +              num_refs += head->ref_mod;
                spin_unlock(&head->lock);
                mutex_unlock(&head->mutex);
        }
@@@ -2179,20 -2189,16 +2179,20 @@@ int btrfs_discard_extent(struct btrfs_f
  
  /* Can return -ENOMEM */
  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 -                       struct btrfs_fs_info *fs_info,
 +                       struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
                         u64 root_objectid, u64 owner, u64 offset)
  {
 +      struct btrfs_fs_info *fs_info = root->fs_info;
        int old_ref_mod, new_ref_mod;
        int ret;
  
        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
               root_objectid == BTRFS_TREE_LOG_OBJECTID);
  
 +      btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
 +                         owner, offset, BTRFS_ADD_DELAYED_REF);
 +
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                                 num_bytes, parent,
@@@ -2338,7 -2344,7 +2338,7 @@@ static void __run_delayed_extent_op(str
  
  static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
                                 struct btrfs_fs_info *fs_info,
 -                               struct btrfs_delayed_ref_node *node,
 +                               struct btrfs_delayed_ref_head *head,
                                 struct btrfs_delayed_extent_op *extent_op)
  {
        struct btrfs_key key;
        if (!path)
                return -ENOMEM;
  
 -      key.objectid = node->bytenr;
 +      key.objectid = head->bytenr;
  
        if (metadata) {
                key.type = BTRFS_METADATA_ITEM_KEY;
                key.offset = extent_op->level;
        } else {
                key.type = BTRFS_EXTENT_ITEM_KEY;
 -              key.offset = node->num_bytes;
 +              key.offset = head->num_bytes;
        }
  
  again:
                                path->slots[0]--;
                                btrfs_item_key_to_cpu(path->nodes[0], &key,
                                                      path->slots[0]);
 -                              if (key.objectid == node->bytenr &&
 +                              if (key.objectid == head->bytenr &&
                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
 -                                  key.offset == node->num_bytes)
 +                                  key.offset == head->num_bytes)
                                        ret = 0;
                        }
                        if (ret > 0) {
                                btrfs_release_path(path);
                                metadata = 0;
  
 -                              key.objectid = node->bytenr;
 -                              key.offset = node->num_bytes;
 +                              key.objectid = head->bytenr;
 +                              key.offset = head->num_bytes;
                                key.type = BTRFS_EXTENT_ITEM_KEY;
                                goto again;
                        }
@@@ -2501,6 -2507,44 +2501,6 @@@ static int run_one_delayed_ref(struct b
                return 0;
        }
  
 -      if (btrfs_delayed_ref_is_head(node)) {
 -              struct btrfs_delayed_ref_head *head;
 -              /*
 -               * we've hit the end of the chain and we were supposed
 -               * to insert this extent into the tree.  But, it got
 -               * deleted before we ever needed to insert it, so all
 -               * we have to do is clean up the accounting
 -               */
 -              BUG_ON(extent_op);
 -              head = btrfs_delayed_node_to_head(node);
 -              trace_run_delayed_ref_head(fs_info, node, head, node->action);
 -
 -              if (head->total_ref_mod < 0) {
 -                      struct btrfs_block_group_cache *cache;
 -
 -                      cache = btrfs_lookup_block_group(fs_info, node->bytenr);
 -                      ASSERT(cache);
 -                      percpu_counter_add(&cache->space_info->total_bytes_pinned,
 -                                         -node->num_bytes);
 -                      btrfs_put_block_group(cache);
 -              }
 -
 -              if (insert_reserved) {
 -                      btrfs_pin_extent(fs_info, node->bytenr,
 -                                       node->num_bytes, 1);
 -                      if (head->is_data) {
 -                              ret = btrfs_del_csums(trans, fs_info,
 -                                                    node->bytenr,
 -                                                    node->num_bytes);
 -                      }
 -              }
 -
 -              /* Also free its reserved qgroup space */
 -              btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
 -                                            head->qgroup_reserved);
 -              return ret;
 -      }
 -
        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
                ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
@@@ -2519,7 -2563,7 +2519,7 @@@ select_delayed_ref(struct btrfs_delayed
  {
        struct btrfs_delayed_ref_node *ref;
  
 -      if (list_empty(&head->ref_list))
 +      if (RB_EMPTY_ROOT(&head->ref_tree))
                return NULL;
  
        /*
                return list_first_entry(&head->ref_add_list,
                                struct btrfs_delayed_ref_node, add_list);
  
 -      ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
 -                             list);
 +      ref = rb_entry(rb_first(&head->ref_tree),
 +                     struct btrfs_delayed_ref_node, ref_node);
        ASSERT(list_empty(&ref->add_list));
        return ref;
  }
  
 +static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
 +                                    struct btrfs_delayed_ref_head *head)
 +{
 +      spin_lock(&delayed_refs->lock);
 +      head->processing = 0;
 +      delayed_refs->num_heads_ready++;
 +      spin_unlock(&delayed_refs->lock);
 +      btrfs_delayed_ref_unlock(head);
 +}
 +
 +static int cleanup_extent_op(struct btrfs_trans_handle *trans,
 +                           struct btrfs_fs_info *fs_info,
 +                           struct btrfs_delayed_ref_head *head)
 +{
 +      struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 +      int ret;
 +
 +      if (!extent_op)
 +              return 0;
 +      head->extent_op = NULL;
 +      if (head->must_insert_reserved) {
 +              btrfs_free_delayed_extent_op(extent_op);
 +              return 0;
 +      }
 +      spin_unlock(&head->lock);
 +      ret = run_delayed_extent_op(trans, fs_info, head, extent_op);
 +      btrfs_free_delayed_extent_op(extent_op);
 +      return ret ? ret : 1;
 +}
 +
 +static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 +                          struct btrfs_fs_info *fs_info,
 +                          struct btrfs_delayed_ref_head *head)
 +{
 +      struct btrfs_delayed_ref_root *delayed_refs;
 +      int ret;
 +
 +      delayed_refs = &trans->transaction->delayed_refs;
 +
 +      ret = cleanup_extent_op(trans, fs_info, head);
 +      if (ret < 0) {
 +              unselect_delayed_ref_head(delayed_refs, head);
 +              btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
 +              return ret;
 +      } else if (ret) {
 +              return ret;
 +      }
 +
 +      /*
 +       * Need to drop our head ref lock and re-acquire the delayed ref lock
 +       * and then re-check to make sure nobody got added.
 +       */
 +      spin_unlock(&head->lock);
 +      spin_lock(&delayed_refs->lock);
 +      spin_lock(&head->lock);
 +      if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
 +              spin_unlock(&head->lock);
 +              spin_unlock(&delayed_refs->lock);
 +              return 1;
 +      }
 +      delayed_refs->num_heads--;
 +      rb_erase(&head->href_node, &delayed_refs->href_root);
 +      RB_CLEAR_NODE(&head->href_node);
 +      spin_unlock(&delayed_refs->lock);
 +      spin_unlock(&head->lock);
 +      atomic_dec(&delayed_refs->num_entries);
 +
 +      trace_run_delayed_ref_head(fs_info, head, 0);
 +
 +      if (head->total_ref_mod < 0) {
 +              struct btrfs_block_group_cache *cache;
 +
 +              cache = btrfs_lookup_block_group(fs_info, head->bytenr);
 +              ASSERT(cache);
 +              percpu_counter_add(&cache->space_info->total_bytes_pinned,
 +                                 -head->num_bytes);
 +              btrfs_put_block_group(cache);
 +
 +              if (head->is_data) {
 +                      spin_lock(&delayed_refs->lock);
 +                      delayed_refs->pending_csums -= head->num_bytes;
 +                      spin_unlock(&delayed_refs->lock);
 +              }
 +      }
 +
 +      if (head->must_insert_reserved) {
 +              btrfs_pin_extent(fs_info, head->bytenr,
 +                               head->num_bytes, 1);
 +              if (head->is_data) {
 +                      ret = btrfs_del_csums(trans, fs_info, head->bytenr,
 +                                            head->num_bytes);
 +              }
 +      }
 +
 +      /* Also free its reserved qgroup space */
 +      btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
 +                                    head->qgroup_reserved);
 +      btrfs_delayed_ref_unlock(head);
 +      btrfs_put_delayed_ref_head(head);
 +      return 0;
 +}
 +
  /*
   * Returns 0 on success or if called with an already aborted transaction.
   * Returns -ENOMEM or -EIO on failure and will abort the transaction.
@@@ -2713,7 -2655,11 +2713,7 @@@ static noinline int __btrfs_run_delayed
                if (ref && ref->seq &&
                    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
                        spin_unlock(&locked_ref->lock);
 -                      spin_lock(&delayed_refs->lock);
 -                      locked_ref->processing = 0;
 -                      delayed_refs->num_heads_ready++;
 -                      spin_unlock(&delayed_refs->lock);
 -                      btrfs_delayed_ref_unlock(locked_ref);
 +                      unselect_delayed_ref_head(delayed_refs, locked_ref);
                        locked_ref = NULL;
                        cond_resched();
                        count++;
                }
  
                /*
 -               * record the must insert reserved flag before we
 -               * drop the spin lock.
 +               * We're done processing refs in this ref_head, clean everything
 +               * up and move on to the next ref_head.
                 */
 -              must_insert_reserved = locked_ref->must_insert_reserved;
 -              locked_ref->must_insert_reserved = 0;
 -
 -              extent_op = locked_ref->extent_op;
 -              locked_ref->extent_op = NULL;
 -
                if (!ref) {
 -
 -
 -                      /* All delayed refs have been processed, Go ahead
 -                       * and send the head node to run_one_delayed_ref,
 -                       * so that any accounting fixes can happen
 -                       */
 -                      ref = &locked_ref->node;
 -
 -                      if (extent_op && must_insert_reserved) {
 -                              btrfs_free_delayed_extent_op(extent_op);
 -                              extent_op = NULL;
 -                      }
 -
 -                      if (extent_op) {
 -                              spin_unlock(&locked_ref->lock);
 -                              ret = run_delayed_extent_op(trans, fs_info,
 -                                                          ref, extent_op);
 -                              btrfs_free_delayed_extent_op(extent_op);
 -
 -                              if (ret) {
 -                                      /*
 -                                       * Need to reset must_insert_reserved if
 -                                       * there was an error so the abort stuff
 -                                       * can cleanup the reserved space
 -                                       * properly.
 -                                       */
 -                                      if (must_insert_reserved)
 -                                              locked_ref->must_insert_reserved = 1;
 -                                      spin_lock(&delayed_refs->lock);
 -                                      locked_ref->processing = 0;
 -                                      delayed_refs->num_heads_ready++;
 -                                      spin_unlock(&delayed_refs->lock);
 -                                      btrfs_debug(fs_info,
 -                                                  "run_delayed_extent_op returned %d",
 -                                                  ret);
 -                                      btrfs_delayed_ref_unlock(locked_ref);
 -                                      return ret;
 -                              }
 +                      ret = cleanup_ref_head(trans, fs_info, locked_ref);
 +                      if (ret > 0) {
 +                              /* We dropped our lock, we need to loop. */
 +                              ret = 0;
                                continue;
 +                      } else if (ret) {
 +                              return ret;
                        }
 +                      locked_ref = NULL;
 +                      count++;
 +                      continue;
 +              }
  
 -                      /*
 -                       * Need to drop our head ref lock and re-acquire the
 -                       * delayed ref lock and then re-check to make sure
 -                       * nobody got added.
 -                       */
 -                      spin_unlock(&locked_ref->lock);
 -                      spin_lock(&delayed_refs->lock);
 -                      spin_lock(&locked_ref->lock);
 -                      if (!list_empty(&locked_ref->ref_list) ||
 -                          locked_ref->extent_op) {
 -                              spin_unlock(&locked_ref->lock);
 -                              spin_unlock(&delayed_refs->lock);
 -                              continue;
 -                      }
 -                      ref->in_tree = 0;
 -                      delayed_refs->num_heads--;
 -                      rb_erase(&locked_ref->href_node,
 -                               &delayed_refs->href_root);
 -                      spin_unlock(&delayed_refs->lock);
 -              } else {
 -                      actual_count++;
 -                      ref->in_tree = 0;
 -                      list_del(&ref->list);
 -                      if (!list_empty(&ref->add_list))
 -                              list_del(&ref->add_list);
 +              actual_count++;
 +              ref->in_tree = 0;
 +              rb_erase(&ref->ref_node, &locked_ref->ref_tree);
 +              RB_CLEAR_NODE(&ref->ref_node);
 +              if (!list_empty(&ref->add_list))
 +                      list_del(&ref->add_list);
 +              /*
 +               * When we play the delayed ref, also correct the ref_mod on
 +               * head
 +               */
 +              switch (ref->action) {
 +              case BTRFS_ADD_DELAYED_REF:
 +              case BTRFS_ADD_DELAYED_EXTENT:
 +                      locked_ref->ref_mod -= ref->ref_mod;
 +                      break;
 +              case BTRFS_DROP_DELAYED_REF:
 +                      locked_ref->ref_mod += ref->ref_mod;
 +                      break;
 +              default:
 +                      WARN_ON(1);
                }
                atomic_dec(&delayed_refs->num_entries);
  
 -              if (!btrfs_delayed_ref_is_head(ref)) {
 -                      /*
 -                       * when we play the delayed ref, also correct the
 -                       * ref_mod on head
 -                       */
 -                      switch (ref->action) {
 -                      case BTRFS_ADD_DELAYED_REF:
 -                      case BTRFS_ADD_DELAYED_EXTENT:
 -                              locked_ref->node.ref_mod -= ref->ref_mod;
 -                              break;
 -                      case BTRFS_DROP_DELAYED_REF:
 -                              locked_ref->node.ref_mod += ref->ref_mod;
 -                              break;
 -                      default:
 -                              WARN_ON(1);
 -                      }
 -              }
 +              /*
 +               * Record the must_insert_reserved flag before we drop the spin
 +               * lock.
 +               */
 +              must_insert_reserved = locked_ref->must_insert_reserved;
 +              locked_ref->must_insert_reserved = 0;
 +
 +              extent_op = locked_ref->extent_op;
 +              locked_ref->extent_op = NULL;
                spin_unlock(&locked_ref->lock);
  
                ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
  
                btrfs_free_delayed_extent_op(extent_op);
                if (ret) {
 -                      spin_lock(&delayed_refs->lock);
 -                      locked_ref->processing = 0;
 -                      delayed_refs->num_heads_ready++;
 -                      spin_unlock(&delayed_refs->lock);
 -                      btrfs_delayed_ref_unlock(locked_ref);
 +                      unselect_delayed_ref_head(delayed_refs, locked_ref);
                        btrfs_put_delayed_ref(ref);
                        btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
                                    ret);
                        return ret;
                }
  
 -              /*
 -               * If this node is a head, that means all the refs in this head
 -               * have been dealt with, and we will pick the next head to deal
 -               * with, so we must unlock the head and drop it from the cluster
 -               * list before we release it.
 -               */
 -              if (btrfs_delayed_ref_is_head(ref)) {
 -                      if (locked_ref->is_data &&
 -                          locked_ref->total_ref_mod < 0) {
 -                              spin_lock(&delayed_refs->lock);
 -                              delayed_refs->pending_csums -= ref->num_bytes;
 -                              spin_unlock(&delayed_refs->lock);
 -                      }
 -                      btrfs_delayed_ref_unlock(locked_ref);
 -                      locked_ref = NULL;
 -              }
                btrfs_put_delayed_ref(ref);
                count++;
                cond_resched();
@@@ -3087,16 -3100,33 +3087,16 @@@ again
                        spin_unlock(&delayed_refs->lock);
                        goto out;
                }
 +              head = rb_entry(node, struct btrfs_delayed_ref_head,
 +                              href_node);
 +              refcount_inc(&head->refs);
 +              spin_unlock(&delayed_refs->lock);
  
 -              while (node) {
 -                      head = rb_entry(node, struct btrfs_delayed_ref_head,
 -                                      href_node);
 -                      if (btrfs_delayed_ref_is_head(&head->node)) {
 -                              struct btrfs_delayed_ref_node *ref;
 -
 -                              ref = &head->node;
 -                              refcount_inc(&ref->refs);
 -
 -                              spin_unlock(&delayed_refs->lock);
 -                              /*
 -                               * Mutex was contended, block until it's
 -                               * released and try again
 -                               */
 -                              mutex_lock(&head->mutex);
 -                              mutex_unlock(&head->mutex);
 +              /* Mutex was contended, block until it's released and retry. */
 +              mutex_lock(&head->mutex);
 +              mutex_unlock(&head->mutex);
  
 -                              btrfs_put_delayed_ref(ref);
 -                              cond_resched();
 -                              goto again;
 -                      } else {
 -                              WARN_ON(1);
 -                      }
 -                      node = rb_next(node);
 -              }
 -              spin_unlock(&delayed_refs->lock);
 +              btrfs_put_delayed_ref_head(head);
                cond_resched();
                goto again;
        }
@@@ -3139,7 -3169,6 +3139,7 @@@ static noinline int check_delayed_ref(s
        struct btrfs_delayed_data_ref *data_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_transaction *cur_trans;
 +      struct rb_node *node;
        int ret = 0;
  
        cur_trans = root->fs_info->running_transaction;
        }
  
        if (!mutex_trylock(&head->mutex)) {
 -              refcount_inc(&head->node.refs);
 +              refcount_inc(&head->refs);
                spin_unlock(&delayed_refs->lock);
  
                btrfs_release_path(path);
                 */
                mutex_lock(&head->mutex);
                mutex_unlock(&head->mutex);
 -              btrfs_put_delayed_ref(&head->node);
 +              btrfs_put_delayed_ref_head(head);
                return -EAGAIN;
        }
        spin_unlock(&delayed_refs->lock);
  
        spin_lock(&head->lock);
 -      list_for_each_entry(ref, &head->ref_list, list) {
 +      /*
 +       * XXX: We should replace this with a proper search function in the
 +       * future.
 +       */
 +      for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
 +              ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
                /* If it's a shared ref we know a cross reference exists */
                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
                        ret = 1;
@@@ -3327,7 -3351,7 +3327,7 @@@ static int __btrfs_mod_ref(struct btrfs
        int level;
        int ret = 0;
        int (*process_func)(struct btrfs_trans_handle *,
 -                          struct btrfs_fs_info *,
 +                          struct btrfs_root *,
                            u64, u64, u64, u64, u64, u64);
  
  
  
                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
                        key.offset -= btrfs_file_extent_offset(buf, fi);
 -                      ret = process_func(trans, fs_info, bytenr, num_bytes,
 +                      ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
                                           key.offset);
                        if (ret)
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = fs_info->nodesize;
 -                      ret = process_func(trans, fs_info, bytenr, num_bytes,
 +                      ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, level - 1, 0);
                        if (ret)
                                goto fail;
@@@ -3992,16 -4016,9 +3992,9 @@@ void btrfs_dec_nocow_writers(struct btr
        btrfs_put_block_group(bg);
  }
  
- static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
- {
-       schedule();
-       return 0;
- }
  void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
  {
-       wait_on_atomic_t(&bg->nocow_writers,
-                        btrfs_wait_nocow_writers_atomic_t,
+       wait_on_atomic_t(&bg->nocow_writers, atomic_t_wait,
                         TASK_UNINTERRUPTIBLE);
  }
  
@@@ -4819,6 -4836,7 +4812,6 @@@ static inline u64 calc_reclaim_items_nr
  static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                            u64 orig, bool wait_ordered)
  {
 -      struct btrfs_block_rsv *block_rsv;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        u64 delalloc_bytes;
        to_reclaim = items * EXTENT_SIZE_PER_ITEM;
  
        trans = (struct btrfs_trans_handle *)current->journal_info;
 -      block_rsv = &fs_info->delalloc_block_rsv;
 -      space_info = block_rsv->space_info;
 +      space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
  
        delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
@@@ -4893,13 -4912,6 +4886,13 @@@ skip_async
        }
  }
  
 +struct reserve_ticket {
 +      u64 bytes;
 +      int error;
 +      struct list_head list;
 +      wait_queue_head_t wait;
 +};
 +
  /**
   * may_commit_transaction - possibly commit the transaction if it's ok to
   * @root - the root we're allocating for
   * will return -ENOSPC.
   */
  static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 -                                struct btrfs_space_info *space_info,
 -                                u64 bytes, int force)
 +                                struct btrfs_space_info *space_info)
  {
 +      struct reserve_ticket *ticket = NULL;
        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
        struct btrfs_trans_handle *trans;
 +      u64 bytes;
  
        trans = (struct btrfs_trans_handle *)current->journal_info;
        if (trans)
                return -EAGAIN;
  
 -      if (force)
 -              goto commit;
 +      spin_lock(&space_info->lock);
 +      if (!list_empty(&space_info->priority_tickets))
 +              ticket = list_first_entry(&space_info->priority_tickets,
 +                                        struct reserve_ticket, list);
 +      else if (!list_empty(&space_info->tickets))
 +              ticket = list_first_entry(&space_info->tickets,
 +                                        struct reserve_ticket, list);
 +      bytes = (ticket) ? ticket->bytes : 0;
 +      spin_unlock(&space_info->lock);
 +
 +      if (!bytes)
 +              return 0;
  
        /* See if there is enough pinned space to make this reservation */
        if (percpu_counter_compare(&space_info->total_bytes_pinned,
                return -ENOSPC;
  
        spin_lock(&delayed_rsv->lock);
 +      if (delayed_rsv->size > bytes)
 +              bytes = 0;
 +      else
 +              bytes -= delayed_rsv->size;
        if (percpu_counter_compare(&space_info->total_bytes_pinned,
 -                                 bytes - delayed_rsv->size) < 0) {
 +                                 bytes) < 0) {
                spin_unlock(&delayed_rsv->lock);
                return -ENOSPC;
        }
@@@ -4967,6 -4964,13 +4960,6 @@@ commit
        return btrfs_commit_transaction(trans);
  }
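
A minimal sketch of the ticket-driven arithmetic in may_commit_transaction() above, with made-up numbers; the initial comparison of total_bytes_pinned against the full ticket is elided, and the snippet is standalone illustration rather than kernel code.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Made-up numbers: the first waiting ticket wants 1 MiB, the delayed
	 * refs rsv already holds 256 KiB, and 900 KiB is currently pinned. */
	uint64_t ticket_bytes = 1024 * 1024;
	uint64_t delayed_rsv_size = 256 * 1024;
	uint64_t total_bytes_pinned = 900 * 1024;

	/* Space reserved for delayed refs gets pinned by the commit itself,
	 * so only the remainder must be covered by total_bytes_pinned. */
	uint64_t need = delayed_rsv_size > ticket_bytes ?
			0 : ticket_bytes - delayed_rsv_size;

	printf("need %llu, pinned %llu -> %s\n",
	       (unsigned long long)need, (unsigned long long)total_bytes_pinned,
	       total_bytes_pinned >= need ? "commit" : "-ENOSPC");
	return 0;
}
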
  
 -struct reserve_ticket {
 -      u64 bytes;
 -      int error;
 -      struct list_head list;
 -      wait_queue_head_t wait;
 -};
 -
  /*
   * Try to flush some data based on policy set by @state. This is only advisory
   * and may fail for various reasons. The caller is supposed to examine the
@@@ -5016,7 -5020,8 +5009,7 @@@ static void flush_space(struct btrfs_fs
                        ret = 0;
                break;
        case COMMIT_TRANS:
 -              ret = may_commit_transaction(fs_info, space_info,
 -                                           num_bytes, 0);
 +              ret = may_commit_transaction(fs_info, space_info);
                break;
        default:
                ret = -ENOSPC;
@@@ -5570,12 -5575,11 +5563,12 @@@ again
        }
  }
  
 -static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 +static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_block_rsv *block_rsv,
                                    struct btrfs_block_rsv *dest, u64 num_bytes)
  {
        struct btrfs_space_info *space_info = block_rsv->space_info;
 +      u64 ret;
  
        spin_lock(&block_rsv->lock);
        if (num_bytes == (u64)-1)
        }
        spin_unlock(&block_rsv->lock);
  
 +      ret = num_bytes;
        if (num_bytes > 0) {
                if (dest) {
                        spin_lock(&dest->lock);
                        space_info_add_old_bytes(fs_info, space_info,
                                                 num_bytes);
        }
 +      return ret;
  }
  
  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
@@@ -5634,15 -5636,6 +5627,15 @@@ void btrfs_init_block_rsv(struct btrfs_
        rsv->type = type;
  }
  
 +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
 +                                 struct btrfs_block_rsv *rsv,
 +                                 unsigned short type)
 +{
 +      btrfs_init_block_rsv(rsv, type);
 +      rsv->space_info = __find_space_info(fs_info,
 +                                          BTRFS_BLOCK_GROUP_METADATA);
 +}
 +
  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
                                              unsigned short type)
  {
        if (!block_rsv)
                return NULL;
  
 -      btrfs_init_block_rsv(block_rsv, type);
 -      block_rsv->space_info = __find_space_info(fs_info,
 -                                                BTRFS_BLOCK_GROUP_METADATA);
 +      btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
        return block_rsv;
  }
  
@@@ -5735,66 -5730,6 +5728,66 @@@ int btrfs_block_rsv_refill(struct btrfs
        return ret;
  }
  
 +/**
 + * btrfs_inode_rsv_refill - refill the inode block rsv.
 + * @inode - the inode we are refilling.
 + * @flush - the flushing restriction.
 + *
 + * Essentially the same as btrfs_block_rsv_refill, except it uses the
 + * block_rsv->size as the minimum size.  We'll either refill the missing amount
 + * or return if we already have enough space.  This will also handle the reserve
 + * tracepoint for the reserved amount.
 + */
 +int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 +                         enum btrfs_reserve_flush_enum flush)
 +{
 +      struct btrfs_root *root = inode->root;
 +      struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 +      u64 num_bytes = 0;
 +      int ret = -ENOSPC;
 +
 +      spin_lock(&block_rsv->lock);
 +      if (block_rsv->reserved < block_rsv->size)
 +              num_bytes = block_rsv->size - block_rsv->reserved;
 +      spin_unlock(&block_rsv->lock);
 +
 +      if (num_bytes == 0)
 +              return 0;
 +
 +      ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 +      if (!ret) {
 +              block_rsv_add_bytes(block_rsv, num_bytes, 0);
 +              trace_btrfs_space_reservation(root->fs_info, "delalloc",
 +                                            btrfs_ino(inode), num_bytes, 1);
 +      }
 +      return ret;
 +}
 +
 +/**
 + * btrfs_inode_rsv_release - release any excessive reservation.
 + * @inode - the inode we need to release from.
 + *
 + * This is the same as btrfs_block_rsv_release, except that it handles the
 + * tracepoint for the reservation.
 + */
 +void btrfs_inode_rsv_release(struct btrfs_inode *inode)
 +{
 +      struct btrfs_fs_info *fs_info = inode->root->fs_info;
 +      struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 +      struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 +      u64 released = 0;
 +
 +      /*
 +       * Since we statically set block_rsv->size, we just say we are
 +       * releasing 0 bytes and then get any reservation over that size
 +       * freed.
 +       */
 +      released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0);
 +      if (released > 0)
 +              trace_btrfs_space_reservation(fs_info, "delalloc",
 +                                            btrfs_ino(inode), released, 0);
 +}
 +
  void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
                             struct btrfs_block_rsv *block_rsv,
                             u64 num_bytes)
@@@ -5866,6 -5801,7 +5859,6 @@@ static void init_global_block_rsv(struc
  
        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
        fs_info->global_block_rsv.space_info = space_info;
 -      fs_info->delalloc_block_rsv.space_info = space_info;
        fs_info->trans_block_rsv.space_info = space_info;
        fs_info->empty_block_rsv.space_info = space_info;
        fs_info->delayed_block_rsv.space_info = space_info;
@@@ -5885,6 -5821,8 +5878,6 @@@ static void release_global_block_rsv(st
  {
        block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
                                (u64)-1);
 -      WARN_ON(fs_info->delalloc_block_rsv.size > 0);
 -      WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
        WARN_ON(fs_info->trans_block_rsv.size > 0);
        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
        WARN_ON(fs_info->chunk_block_rsv.size > 0);
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                  struct btrfs_fs_info *fs_info)
  {
 -      if (!trans->block_rsv)
 +      if (!trans->block_rsv) {
 +              ASSERT(!trans->bytes_reserved);
                return;
 +      }
  
        if (!trans->bytes_reserved)
                return;
  
 +      ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
        trace_btrfs_space_reservation(fs_info, "transaction",
                                      trans->transid, trans->bytes_reserved, 0);
        btrfs_block_rsv_release(fs_info, trans->block_rsv,
@@@ -6026,37 -5961,104 +6019,37 @@@ void btrfs_subvolume_release_metadata(s
        btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
  }
  
 -/**
 - * drop_outstanding_extent - drop an outstanding extent
 - * @inode: the inode we're dropping the extent for
 - * @num_bytes: the number of bytes we're releasing.
 - *
 - * This is called when we are freeing up an outstanding extent, either called
 - * after an error or after an extent is written.  This will return the number of
 - * reserved extents that need to be freed.  This must be called with
 - * BTRFS_I(inode)->lock held.
 - */
 -static unsigned drop_outstanding_extent(struct btrfs_inode *inode,
 -              u64 num_bytes)
 -{
 -      unsigned drop_inode_space = 0;
 -      unsigned dropped_extents = 0;
 -      unsigned num_extents;
 -
 -      num_extents = count_max_extents(num_bytes);
 -      ASSERT(num_extents);
 -      ASSERT(inode->outstanding_extents >= num_extents);
 -      inode->outstanding_extents -= num_extents;
 -
 -      if (inode->outstanding_extents == 0 &&
 -          test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
 -                             &inode->runtime_flags))
 -              drop_inode_space = 1;
 -
 -      /*
 -       * If we have more or the same amount of outstanding extents than we have
 -       * reserved then we need to leave the reserved extents count alone.
 -       */
 -      if (inode->outstanding_extents >= inode->reserved_extents)
 -              return drop_inode_space;
 -
 -      dropped_extents = inode->reserved_extents - inode->outstanding_extents;
 -      inode->reserved_extents -= dropped_extents;
 -      return dropped_extents + drop_inode_space;
 -}
 -
 -/**
 - * calc_csum_metadata_size - return the amount of metadata space that must be
 - *    reserved/freed for the given bytes.
 - * @inode: the inode we're manipulating
 - * @num_bytes: the number of bytes in question
 - * @reserve: 1 if we are reserving space, 0 if we are freeing space
 - *
 - * This adjusts the number of csum_bytes in the inode and then returns the
 - * correct amount of metadata that must either be reserved or freed.  We
 - * calculate how many checksums we can fit into one leaf and then divide the
 - * number of bytes that will need to be checksumed by this value to figure out
 - * how many checksums will be required.  If we are adding bytes then the number
 - * may go up and we will return the number of additional bytes that must be
 - * reserved.  If it is going down we will return the number of bytes that must
 - * be freed.
 - *
 - * This must be called with BTRFS_I(inode)->lock held.
 - */
 -static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
 -                                 int reserve)
 +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 +                                               struct btrfs_inode *inode)
  {
 -      struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 -      u64 old_csums, num_csums;
 -
 -      if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0)
 -              return 0;
 -
 -      old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
 -      if (reserve)
 -              inode->csum_bytes += num_bytes;
 -      else
 -              inode->csum_bytes -= num_bytes;
 -      num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
 -
 -      /* No change, no need to reserve more */
 -      if (old_csums == num_csums)
 -              return 0;
 +      struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 +      u64 reserve_size = 0;
 +      u64 csum_leaves;
 +      unsigned outstanding_extents;
  
 -      if (reserve)
 -              return btrfs_calc_trans_metadata_size(fs_info,
 -                                                    num_csums - old_csums);
 +      lockdep_assert_held(&inode->lock);
 +      outstanding_extents = inode->outstanding_extents;
 +      if (outstanding_extents)
 +              reserve_size = btrfs_calc_trans_metadata_size(fs_info,
 +                                              outstanding_extents + 1);
 +      csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
 +                                               inode->csum_bytes);
 +      reserve_size += btrfs_calc_trans_metadata_size(fs_info,
 +                                                     csum_leaves);
  
 -      return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
 +      spin_lock(&block_rsv->lock);
 +      block_rsv->size = reserve_size;
 +      spin_unlock(&block_rsv->lock);
  }
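
A rough worked example of how the rsv size computed above scales, assuming btrfs_calc_trans_metadata_size() is nodesize * BTRFS_MAX_LEVEL * 2 per item (its definition around this series) and 16 KiB nodes; the numbers are illustrative only.

#include <stdio.h>
#include <stdint.h>

#define NODESIZE	(16 * 1024ULL)	/* assumed nodesize */
#define MAX_LEVEL	8		/* BTRFS_MAX_LEVEL */

static uint64_t calc_metadata_size(unsigned int items)
{
	return NODESIZE * MAX_LEVEL * 2 * items;
}

int main(void)
{
	unsigned int outstanding_extents = 1;	/* one pending delalloc extent */
	unsigned int csum_leaves = 1;		/* its csums fit in one leaf */
	uint64_t reserve_size = 0;

	if (outstanding_extents)
		reserve_size = calc_metadata_size(outstanding_extents + 1);
	reserve_size += calc_metadata_size(csum_leaves);

	/* 3 items * 256 KiB = 786432 bytes for this example. */
	printf("block_rsv->size = %llu\n", (unsigned long long)reserve_size);
	return 0;
}
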
  
  int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
  {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
        struct btrfs_root *root = inode->root;
 -      struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
 -      u64 to_reserve = 0;
 -      u64 csum_bytes;
        unsigned nr_extents;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret = 0;
        bool delalloc_lock = true;
 -      u64 to_free = 0;
 -      unsigned dropped;
 -      bool release_extra = false;
  
        /* If we are a free space inode we need to not flush since we will be in
         * the middle of a transaction commit.  We also don't need the delalloc
  
        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
  
 +      /* Add our new extents and calculate the new rsv size. */
        spin_lock(&inode->lock);
        nr_extents = count_max_extents(num_bytes);
 -      inode->outstanding_extents += nr_extents;
 -
 -      nr_extents = 0;
 -      if (inode->outstanding_extents > inode->reserved_extents)
 -              nr_extents += inode->outstanding_extents -
 -                      inode->reserved_extents;
 -
 -      /* We always want to reserve a slot for updating the inode. */
 -      to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1);
 -      to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
 -      csum_bytes = inode->csum_bytes;
 +      btrfs_mod_outstanding_extents(inode, nr_extents);
 +      inode->csum_bytes += num_bytes;
 +      btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
  
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
                        goto out_fail;
        }
  
 -      ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
 +      ret = btrfs_inode_rsv_refill(inode, flush);
        if (unlikely(ret)) {
                btrfs_qgroup_free_meta(root,
                                       nr_extents * fs_info->nodesize);
                goto out_fail;
        }
  
 -      spin_lock(&inode->lock);
 -      if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
 -                           &inode->runtime_flags)) {
 -              to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
 -              release_extra = true;
 -      }
 -      inode->reserved_extents += nr_extents;
 -      spin_unlock(&inode->lock);
 -
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
 -
 -      if (to_reserve)
 -              trace_btrfs_space_reservation(fs_info, "delalloc",
 -                                            btrfs_ino(inode), to_reserve, 1);
 -      if (release_extra)
 -              btrfs_block_rsv_release(fs_info, block_rsv,
 -                              btrfs_calc_trans_metadata_size(fs_info, 1));
        return 0;
  
  out_fail:
        spin_lock(&inode->lock);
 -      dropped = drop_outstanding_extent(inode, num_bytes);
 -      /*
 -       * If the inodes csum_bytes is the same as the original
 -       * csum_bytes then we know we haven't raced with any free()ers
 -       * so we can just reduce our inodes csum bytes and carry on.
 -       */
 -      if (inode->csum_bytes == csum_bytes) {
 -              calc_csum_metadata_size(inode, num_bytes, 0);
 -      } else {
 -              u64 orig_csum_bytes = inode->csum_bytes;
 -              u64 bytes;
 -
 -              /*
 -               * This is tricky, but first we need to figure out how much we
 -               * freed from any free-ers that occurred during this
 -               * reservation, so we reset ->csum_bytes to the csum_bytes
 -               * before we dropped our lock, and then call the free for the
 -               * number of bytes that were freed while we were trying our
 -               * reservation.
 -               */
 -              bytes = csum_bytes - inode->csum_bytes;
 -              inode->csum_bytes = csum_bytes;
 -              to_free = calc_csum_metadata_size(inode, bytes, 0);
 -
 -
 -              /*
 -               * Now we need to see how much we would have freed had we not
 -               * been making this reservation and our ->csum_bytes were not
 -               * artificially inflated.
 -               */
 -              inode->csum_bytes = csum_bytes - num_bytes;
 -              bytes = csum_bytes - orig_csum_bytes;
 -              bytes = calc_csum_metadata_size(inode, bytes, 0);
 -
 -              /*
 -               * Now reset ->csum_bytes to what it should be.  If bytes is
 -               * more than to_free then we would have freed more space had we
 -               * not had an artificially high ->csum_bytes, so we need to free
 -               * the remainder.  If bytes is the same or less then we don't
 -               * need to do anything, the other free-ers did the correct
 -               * thing.
 -               */
 -              inode->csum_bytes = orig_csum_bytes - num_bytes;
 -              if (bytes > to_free)
 -                      to_free = bytes - to_free;
 -              else
 -                      to_free = 0;
 -      }
 +      nr_extents = count_max_extents(num_bytes);
 +      btrfs_mod_outstanding_extents(inode, -nr_extents);
 +      inode->csum_bytes -= num_bytes;
 +      btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 -      if (dropped)
 -              to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
  
 -      if (to_free) {
 -              btrfs_block_rsv_release(fs_info, block_rsv, to_free);
 -              trace_btrfs_space_reservation(fs_info, "delalloc",
 -                                            btrfs_ino(inode), to_free, 0);
 -      }
 +      btrfs_inode_rsv_release(inode);
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
        return ret;
  
  /**
   * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
 - * @inode: the inode to release the reservation for
 - * @num_bytes: the number of bytes we're releasing
 + * @inode: the inode to release the reservation for.
 + * @num_bytes: the number of bytes we are releasing.
   *
   * This will release the metadata reservation for an inode.  This can be called
   * once we complete IO for a given set of bytes to release their metadata
 - * reservations.
 + * reservations, or on error for the same reason.
   */
  void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
  {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 -      u64 to_free = 0;
 -      unsigned dropped;
  
        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
        spin_lock(&inode->lock);
 -      dropped = drop_outstanding_extent(inode, num_bytes);
 -
 -      if (num_bytes)
 -              to_free = calc_csum_metadata_size(inode, num_bytes, 0);
 +      inode->csum_bytes -= num_bytes;
 +      btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 -      if (dropped > 0)
 -              to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
  
        if (btrfs_is_testing(fs_info))
                return;
  
 -      trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
 -                                    to_free, 0);
 +      btrfs_inode_rsv_release(inode);
 +}
  
 -      btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
 +/**
 + * btrfs_delalloc_release_extents - release our outstanding_extents
 + * @inode: the inode to balance the reservation for.
 + * @num_bytes: the number of bytes we originally reserved with
 + *
 + * When we reserve space we increase outstanding_extents for the extents we may
 + * add.  Once we've set the range as delalloc or created our ordered extents we
 + * have outstanding_extents to track the real usage, so we use this to free our
 + * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
 + * with btrfs_delalloc_reserve_metadata.
 + */
 +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
 +{
 +      struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 +      unsigned num_extents;
 +
 +      spin_lock(&inode->lock);
 +      num_extents = count_max_extents(num_bytes);
 +      btrfs_mod_outstanding_extents(inode, -num_extents);
 +      btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 +      spin_unlock(&inode->lock);
 +
 +      if (btrfs_is_testing(fs_info))
 +              return;
 +
 +      btrfs_inode_rsv_release(inode);
  }
  
  /**
@@@ -6219,7 -6275,10 +6212,7 @@@ int btrfs_delalloc_reserve_space(struc
   * @inode: inode we're releasing space for
   * @start: start position of the space already reserved
   * @len: the len of the space already reserved
 - *
 - * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
 - * called in the case that we don't need the metadata AND data reservations
 - * anymore.  So if there is an error or we insert an inline extent.
 + * @release_bytes: the len of the space we consumed or didn't use
   *
   * This function will release the metadata space that was not used and will
   * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
   * Also it will handle the qgroup reserved space.
   */
  void btrfs_delalloc_release_space(struct inode *inode,
 -                      struct extent_changeset *reserved, u64 start, u64 len)
 +                                struct extent_changeset *reserved,
 +                                u64 start, u64 len)
  {
        btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
        btrfs_free_reserved_data_space(inode, reserved, start, len);
@@@ -6530,12 -6588,6 +6523,6 @@@ void btrfs_dec_block_group_reservations
        btrfs_put_block_group(bg);
  }
  
- static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
- {
-       schedule();
-       return 0;
- }
  void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
  {
        struct btrfs_space_info *space_info = bg->space_info;
        down_write(&space_info->groups_sem);
        up_write(&space_info->groups_sem);
  
-       wait_on_atomic_t(&bg->reservations,
-                        btrfs_wait_bg_reservations_atomic_t,
+       wait_on_atomic_t(&bg->reservations, atomic_t_wait,
                         TASK_UNINTERRUPTIBLE);
  }
  
@@@ -6893,7 -6944,7 +6879,7 @@@ static int __btrfs_free_extent(struct b
        BUG_ON(!is_data && refs_to_drop != 1);
  
        if (is_data)
 -              skinny_metadata = 0;
 +              skinny_metadata = false;
  
        ret = lookup_extent_backref(trans, info, path, &iref,
                                    bytenr, num_bytes, parent,
@@@ -7148,7 -7199,7 +7134,7 @@@ static noinline int check_ref_cleanup(s
                goto out_delayed_unlock;
  
        spin_lock(&head->lock);
 -      if (!list_empty(&head->ref_list))
 +      if (!RB_EMPTY_ROOT(&head->ref_tree))
                goto out;
  
        if (head->extent_op) {
         * at this point we have a head with no other entries.  Go
         * ahead and process it.
         */
 -      head->node.in_tree = 0;
        rb_erase(&head->href_node, &delayed_refs->href_root);
 -
 +      RB_CLEAR_NODE(&head->href_node);
        atomic_dec(&delayed_refs->num_entries);
  
        /*
                ret = 1;
  
        mutex_unlock(&head->mutex);
 -      btrfs_put_delayed_ref(&head->node);
 +      btrfs_put_delayed_ref_head(head);
        return ret;
  out:
        spin_unlock(&head->lock);
@@@ -7211,10 -7263,6 +7197,10 @@@ void btrfs_free_tree_block(struct btrfs
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                int old_ref_mod, new_ref_mod;
  
 +              btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
 +                                 root->root_key.objectid,
 +                                 btrfs_header_level(buf), 0,
 +                                 BTRFS_DROP_DELAYED_REF);
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
                                                 buf->len, parent,
                                                 root->root_key.objectid,
  
  /* Can return -ENOMEM */
  int btrfs_free_extent(struct btrfs_trans_handle *trans,
 -                    struct btrfs_fs_info *fs_info,
 +                    struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
                      u64 owner, u64 offset)
  {
 +      struct btrfs_fs_info *fs_info = root->fs_info;
        int old_ref_mod, new_ref_mod;
        int ret;
  
        if (btrfs_is_testing(fs_info))
                return 0;
  
 +      if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
 +              btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
 +                                 root_objectid, owner, offset,
 +                                 BTRFS_DROP_DELAYED_REF);
  
        /*
         * tree log blocks never actually go into the extent allocation
@@@ -8249,22 -8292,17 +8235,22 @@@ static int alloc_reserved_tree_block(st
  }
  
  int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 -                                   u64 root_objectid, u64 owner,
 +                                   struct btrfs_root *root, u64 owner,
                                     u64 offset, u64 ram_bytes,
                                     struct btrfs_key *ins)
  {
 -      struct btrfs_fs_info *fs_info = trans->fs_info;
 +      struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
  
 -      BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
 +      BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
 +
 +      btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
 +                         root->root_key.objectid, owner, offset,
 +                         BTRFS_ADD_DELAYED_EXTENT);
  
        ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
 -                                       ins->offset, 0, root_objectid, owner,
 +                                       ins->offset, 0,
 +                                       root->root_key.objectid, owner,
                                         offset, ram_bytes,
                                         BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
        return ret;
@@@ -8486,9 -8524,6 +8472,9 @@@ struct extent_buffer *btrfs_alloc_tree_
                extent_op->is_data = false;
                extent_op->level = level;
  
 +              btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
 +                                 root_objectid, level, 0,
 +                                 BTRFS_ADD_DELAYED_EXTENT);
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
                                                 ins.offset, parent,
                                                 root_objectid, level,
@@@ -8845,7 -8880,7 +8831,7 @@@ skip
                                             ret);
                        }
                }
 -              ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize,
 +              ret = btrfs_free_extent(trans, root, bytenr, blocksize,
                                        parent, root->root_key.objectid,
                                        level - 1, 0);
                if (ret)
@@@ -9262,7 -9297,7 +9248,7 @@@ out
         * don't have it in the radix (like when we recover after a power fail
         * or unmount) so we don't leak memory.
         */
 -      if (!for_reloc && root_dropped == false)
 +      if (!for_reloc && !root_dropped)
                btrfs_add_dead_root(root);
        if (err && err != -EAGAIN)
                btrfs_handle_fs_error(fs_info, err, NULL);
@@@ -9919,9 -9954,9 +9905,9 @@@ int btrfs_free_block_groups(struct btrf
        return 0;
  }
  
 -static void __link_block_group(struct btrfs_space_info *space_info,
 -                             struct btrfs_block_group_cache *cache)
 +static void link_block_group(struct btrfs_block_group_cache *cache)
  {
 +      struct btrfs_space_info *space_info = cache->space_info;
        int index = get_block_group_index(cache);
        bool first = false;
  
@@@ -10129,7 -10164,7 +10115,7 @@@ int btrfs_read_block_groups(struct btrf
  
                cache->space_info = space_info;
  
 -              __link_block_group(space_info, cache);
 +              link_block_group(cache);
  
                set_avail_alloc_bits(info, cache->flags);
                if (btrfs_chunk_readonly(info, cache->key.objectid)) {
@@@ -10288,7 -10323,7 +10274,7 @@@ int btrfs_make_block_group(struct btrfs
                                cache->bytes_super, &cache->space_info);
        update_global_block_rsv(fs_info);
  
 -      __link_block_group(cache->space_info, cache);
 +      link_block_group(cache);
  
        list_add_tail(&cache->bg_list, &trans->new_bgs);
  
@@@ -10338,8 -10373,6 +10324,8 @@@ int btrfs_remove_block_group(struct btr
         * remove it.
         */
        free_excluded_extents(fs_info, block_group);
 +      btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
 +                                block_group->key.offset);
  
        memcpy(&key, &block_group->key, sizeof(key));
        index = get_block_group_index(block_group);
@@@ -11059,12 -11092,6 +11045,6 @@@ int btrfs_start_write_no_snapshotting(s
        return 1;
  }
  
- static int wait_snapshotting_atomic_t(atomic_t *a)
- {
-       schedule();
-       return 0;
- }
  void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
  {
        while (true) {
                ret = btrfs_start_write_no_snapshotting(root);
                if (ret)
                        break;
-               wait_on_atomic_t(&root->will_be_snapshotted,
-                                wait_snapshotting_atomic_t,
+               wait_on_atomic_t(&root->will_be_snapshotted, atomic_t_wait,
                                 TASK_UNINTERRUPTIBLE);
        }
  }
diff --combined mm/filemap.c
index 923fc2ebd74a94fc7f4024a6886f9c27cdf643a7,5bcc87adbeeb6020cbba0fc0b7f5dee03bce7224..ee83baaf855d555eac2f06a8adbd333836b3e6f7
@@@ -35,7 -35,6 +35,7 @@@
  #include <linux/hugetlb.h>
  #include <linux/memcontrol.h>
  #include <linux/cleancache.h>
 +#include <linux/shmem_fs.h>
  #include <linux/rmap.h>
  #include "internal.h"
  
@@@ -135,7 -134,7 +135,7 @@@ static int page_cache_tree_insert(struc
                        *shadowp = p;
        }
        __radix_tree_replace(&mapping->page_tree, node, slot, page,
 -                           workingset_update_node, mapping);
 +                           workingset_lookup_update(mapping));
        mapping->nrpages++;
        return 0;
  }
@@@ -163,12 -162,9 +163,12 @@@ static void page_cache_tree_delete(stru
  
                radix_tree_clear_tags(&mapping->page_tree, node, slot);
                __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
 -                                   workingset_update_node, mapping);
 +                              workingset_lookup_update(mapping));
        }
  
 +      page->mapping = NULL;
 +      /* Leave page->index set: truncation lookup relies upon it */
 +
        if (shadow) {
                mapping->nrexceptional += nr;
                /*
        mapping->nrpages -= nr;
  }
  
 -/*
 - * Delete a page from the page cache and free it. Caller has to make
 - * sure the page is locked and that nobody else uses it - or that usage
 - * is safe.  The caller must hold the mapping's tree_lock.
 - */
 -void __delete_from_page_cache(struct page *page, void *shadow)
 +static void unaccount_page_cache_page(struct address_space *mapping,
 +                                    struct page *page)
  {
 -      struct address_space *mapping = page->mapping;
 -      int nr = hpage_nr_pages(page);
 +      int nr;
  
 -      trace_mm_filemap_delete_from_page_cache(page);
        /*
         * if we're uptodate, flush out into the cleancache, otherwise
         * invalidate any existing cleancache entries.  We can't leave
                }
        }
  
 -      page_cache_tree_delete(mapping, page, shadow);
 -
 -      page->mapping = NULL;
 -      /* Leave page->index set: truncation lookup relies upon it */
 -
        /* hugetlb pages do not participate in page cache accounting. */
        if (PageHuge(page))
                return;
  
 +      nr = hpage_nr_pages(page);
 +
        __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
        if (PageSwapBacked(page)) {
                __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
        }
  
        /*
 -       * At this point page must be either written or cleaned by truncate.
 -       * Dirty page here signals a bug and loss of unwritten data.
 +       * At this point page must be either written or cleaned by
 +       * truncate.  Dirty page here signals a bug and loss of
 +       * unwritten data.
         *
 -       * This fixes dirty accounting after removing the page entirely but
 -       * leaves PageDirty set: it has no effect for truncated page and
 -       * anyway will be cleared before returning page into buddy allocator.
 +       * This fixes dirty accounting after removing the page entirely
 +       * but leaves PageDirty set: it has no effect for truncated
 +       * page and anyway will be cleared before returning page into
 +       * buddy allocator.
         */
        if (WARN_ON_ONCE(PageDirty(page)))
                account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
  }
  
 +/*
 + * Delete a page from the page cache and free it. Caller has to make
 + * sure the page is locked and that nobody else uses it - or that usage
 + * is safe.  The caller must hold the mapping's tree_lock.
 + */
 +void __delete_from_page_cache(struct page *page, void *shadow)
 +{
 +      struct address_space *mapping = page->mapping;
 +
 +      trace_mm_filemap_delete_from_page_cache(page);
 +
 +      unaccount_page_cache_page(mapping, page);
 +      page_cache_tree_delete(mapping, page, shadow);
 +}
 +
 +static void page_cache_free_page(struct address_space *mapping,
 +                              struct page *page)
 +{
 +      void (*freepage)(struct page *);
 +
 +      freepage = mapping->a_ops->freepage;
 +      if (freepage)
 +              freepage(page);
 +
 +      if (PageTransHuge(page) && !PageHuge(page)) {
 +              page_ref_sub(page, HPAGE_PMD_NR);
 +              VM_BUG_ON_PAGE(page_count(page) <= 0, page);
 +      } else {
 +              put_page(page);
 +      }
 +}
 +
  /**
   * delete_from_page_cache - delete page from page cache
   * @page: the page which the kernel is trying to remove from page cache
@@@ -295,98 -266,27 +295,98 @@@ void delete_from_page_cache(struct pag
  {
        struct address_space *mapping = page_mapping(page);
        unsigned long flags;
 -      void (*freepage)(struct page *);
  
        BUG_ON(!PageLocked(page));
 -
 -      freepage = mapping->a_ops->freepage;
 -
        spin_lock_irqsave(&mapping->tree_lock, flags);
        __delete_from_page_cache(page, NULL);
        spin_unlock_irqrestore(&mapping->tree_lock, flags);
  
 -      if (freepage)
 -              freepage(page);
 +      page_cache_free_page(mapping, page);
 +}
 +EXPORT_SYMBOL(delete_from_page_cache);
  
 -      if (PageTransHuge(page) && !PageHuge(page)) {
 -              page_ref_sub(page, HPAGE_PMD_NR);
 -              VM_BUG_ON_PAGE(page_count(page) <= 0, page);
 -      } else {
 -              put_page(page);
 +/*
 + * page_cache_tree_delete_batch - delete several pages from page cache
 + * @mapping: the mapping to which pages belong
 + * @pvec: pagevec with pages to delete
 + *
 + * The function walks over mapping->page_tree and removes pages passed in @pvec
 + * from the radix tree. The function expects @pvec to be sorted by page index.
 + * It tolerates holes in @pvec (radix tree entries at those indices are not
 + * modified). The function expects only THP head pages to be present in the
 + * @pvec and takes care to delete all corresponding tail pages from the radix
 + * tree as well.
 + *
 + * The function expects mapping->tree_lock to be held.
 + */
 +static void
 +page_cache_tree_delete_batch(struct address_space *mapping,
 +                           struct pagevec *pvec)
 +{
 +      struct radix_tree_iter iter;
 +      void **slot;
 +      int total_pages = 0;
 +      int i = 0, tail_pages = 0;
 +      struct page *page;
 +      pgoff_t start;
 +
 +      start = pvec->pages[0]->index;
 +      radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
 +              if (i >= pagevec_count(pvec) && !tail_pages)
 +                      break;
 +              page = radix_tree_deref_slot_protected(slot,
 +                                                     &mapping->tree_lock);
 +              if (radix_tree_exceptional_entry(page))
 +                      continue;
 +              if (!tail_pages) {
 +                      /*
 +                       * Some page got inserted in our range? Skip it. We
 +                       * have our pages locked so they are protected from
 +                       * being removed.
 +                       */
 +                      if (page != pvec->pages[i])
 +                              continue;
 +                      WARN_ON_ONCE(!PageLocked(page));
 +                      if (PageTransHuge(page) && !PageHuge(page))
 +                              tail_pages = HPAGE_PMD_NR - 1;
 +                      page->mapping = NULL;
 +                      /*
 +                       * Leave page->index set: truncation lookup relies
 +                       * upon it
 +                       */
 +                      i++;
 +              } else {
 +                      tail_pages--;
 +              }
 +              radix_tree_clear_tags(&mapping->page_tree, iter.node, slot);
 +              __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL,
 +                              workingset_lookup_update(mapping));
 +              total_pages++;
        }
 +      mapping->nrpages -= total_pages;
 +}
 +
 +void delete_from_page_cache_batch(struct address_space *mapping,
 +                                struct pagevec *pvec)
 +{
 +      int i;
 +      unsigned long flags;
 +
 +      if (!pagevec_count(pvec))
 +              return;
 +
 +      spin_lock_irqsave(&mapping->tree_lock, flags);
 +      for (i = 0; i < pagevec_count(pvec); i++) {
 +              trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
 +
 +              unaccount_page_cache_page(mapping, pvec->pages[i]);
 +      }
 +      page_cache_tree_delete_batch(mapping, pvec);
 +      spin_unlock_irqrestore(&mapping->tree_lock, flags);
 +
 +      for (i = 0; i < pagevec_count(pvec); i++)
 +              page_cache_free_page(mapping, pvec->pages[i]);
  }
 -EXPORT_SYMBOL(delete_from_page_cache);
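
A minimal sketch of what a caller of delete_from_page_cache_batch() is expected to set up, per the constraints in the helper's comment; collect_locked_pages() is a made-up placeholder, and in the tree the real user is the truncate path.

static void example_drop_range(struct address_space *mapping,
			       pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec);
	/* Fill pvec with locked pages of @mapping, sorted by ->index,
	 * THP head pages only. */
	if (!collect_locked_pages(mapping, start, end, &pvec))
		return;

	delete_from_page_cache_batch(mapping, &pvec);

	for (i = 0; i < pagevec_count(&pvec); i++)
		unlock_page(pvec.pages[i]);
	pagevec_release(&pvec);		/* drop the lookup references */
}
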
  
  int filemap_check_errors(struct address_space *mapping)
  {
@@@ -519,18 -419,20 +519,18 @@@ static void __filemap_fdatawait_range(s
        if (end_byte < start_byte)
                return;
  
 -      pagevec_init(&pvec, 0);
 -      while ((index <= end) &&
 -                      (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 -                      PAGECACHE_TAG_WRITEBACK,
 -                      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
 +      pagevec_init(&pvec);
 +      while (index <= end) {
                unsigned i;
  
 +              nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
 +                              end, PAGECACHE_TAG_WRITEBACK);
 +              if (!nr_pages)
 +                      break;
 +
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
  
 -                      /* until radix tree lookup accepts end_index */
 -                      if (page->index > end)
 -                              continue;
 -
                        wait_on_page_writeback(page);
                        ClearPageError(page);
                }
@@@ -1139,6 -1041,7 +1139,7 @@@ int wait_on_page_bit_killable(struct pa
        wait_queue_head_t *q = page_waitqueue(page);
        return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
  }
+ EXPORT_SYMBOL(wait_on_page_bit_killable);
  
  /**
   * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
@@@ -1852,10 -1755,9 +1853,10 @@@ repeat
  EXPORT_SYMBOL(find_get_pages_contig);
  
  /**
 - * find_get_pages_tag - find and return pages that match @tag
 + * find_get_pages_range_tag - find and return pages in given range matching @tag
   * @mapping:  the address_space to search
   * @index:    the starting page index
 + * @end:      The final page index (inclusive)
   * @tag:      the tag index
   * @nr_pages: the maximum number of pages
   * @pages:    where the resulting pages are placed
   * Like find_get_pages, except we only return pages which are tagged with
   * @tag.   We update @index to index the next page for the traversal.
   */
 -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 -                      int tag, unsigned int nr_pages, struct page **pages)
 +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
 +                      pgoff_t end, int tag, unsigned int nr_pages,
 +                      struct page **pages)
  {
        struct radix_tree_iter iter;
        void **slot;
        radix_tree_for_each_tagged(slot, &mapping->page_tree,
                                   &iter, *index, tag) {
                struct page *head, *page;
 +
 +              if (iter.index > end)
 +                      break;
  repeat:
                page = radix_tree_deref_slot(slot);
                if (unlikely(!page))
                }
  
                pages[ret] = page;
 -              if (++ret == nr_pages)
 -                      break;
 +              if (++ret == nr_pages) {
 +                      *index = pages[ret - 1]->index + 1;
 +                      goto out;
 +              }
        }
  
 +      /*
 +       * We come here when we reach @end. We take care not to overflow the
 +       * index @index as it confuses some of the callers. This breaks the
 +       * iteration when there is a page at index -1, but that is already
 +       * broken anyway.
 +       */
 +      if (end == (pgoff_t)-1)
 +              *index = (pgoff_t)-1;
 +      else
 +              *index = end + 1;
 +out:
        rcu_read_unlock();
  
 -      if (ret)
 -              *index = pages[ret - 1]->index + 1;
 -
        return ret;
  }
 -EXPORT_SYMBOL(find_get_pages_tag);
 +EXPORT_SYMBOL(find_get_pages_range_tag);
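
A minimal sketch of a range-limited tagged walk with the new helper, mirroring the __filemap_fdatawait_range() loop earlier in this diff; handle_page() is a made-up callback.

static void example_walk_writeback(struct address_space *mapping,
				   pgoff_t index, pgoff_t end)
{
	struct pagevec pvec;
	unsigned int nr_pages, i;

	pagevec_init(&pvec);
	while (index <= end) {
		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
						end, PAGECACHE_TAG_WRITEBACK);
		if (!nr_pages)
			break;

		/* No "page->index > end" check needed any more: the lookup
		 * stops at @end and advances @index for us. */
		for (i = 0; i < nr_pages; i++)
			handle_page(pvec.pages[i]);

		pagevec_release(&pvec);
		cond_resched();
	}
}
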
  
  /**
   * find_get_entries_tag - find and return entries that match @tag
@@@ -2272,7 -2160,7 +2273,7 @@@ no_cached_page
                 * Ok, it wasn't cached, so we need to create a new
                 * page..
                 */
 -              page = page_cache_alloc_cold(mapping);
 +              page = page_cache_alloc(mapping);
                if (!page) {
                        error = -ENOMEM;
                        goto out;
@@@ -2384,7 -2272,7 +2385,7 @@@ static int page_cache_read(struct file 
        int ret;
  
        do {
 -              page = __page_cache_alloc(gfp_mask|__GFP_COLD);
 +              page = __page_cache_alloc(gfp_mask);
                if (!page)
                        return -ENOMEM;
  
@@@ -2788,7 -2676,7 +2789,7 @@@ static struct page *do_read_cache_page(
  repeat:
        page = find_get_page(mapping, index);
        if (!page) {
 -              page = __page_cache_alloc(gfp | __GFP_COLD);
 +              page = __page_cache_alloc(gfp);
                if (!page)
                        return ERR_PTR(-ENOMEM);
                err = add_to_page_cache_lru(page, mapping, index, gfp);