Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs...
author    Chris Mason <chris.mason@oracle.com>    Thu, 24 Sep 2009 14:00:58 +0000 (10:00 -0400)
committer Chris Mason <chris.mason@oracle.com>    Thu, 24 Sep 2009 14:00:58 +0000 (10:00 -0400)
Conflicts:
fs/btrfs/super.c

31 files changed:
fs/btrfs/async-thread.c
fs/btrfs/async-thread.h
fs/btrfs/btrfs_inode.h
fs/btrfs/compression.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/dir-item.c
fs/btrfs/disk-io.c
fs/btrfs/export.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/extent_map.h
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode-item.c
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/orphan.c
fs/btrfs/relocation.c
fs/btrfs/root-tree.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 019e8af449abfeb1679c1a20c801d6b0082edf8b..282ca085c2fbff854bcf4a396d0773db129c5085 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,9 @@ struct btrfs_worker_thread {
        /* number of things on the pending list */
        atomic_t num_pending;
 
+       /* reference counter for this struct */
+       atomic_t refs;
+
        unsigned long sequence;
 
        /* protects the pending list. */
@@ -71,7 +74,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
                unsigned long flags;
                spin_lock_irqsave(&worker->workers->lock, flags);
                worker->idle = 1;
-               list_move(&worker->worker_list, &worker->workers->idle_list);
+
+               /* the list may be empty if the worker is just starting */
+               if (!list_empty(&worker->worker_list)) {
+                       list_move(&worker->worker_list,
+                                &worker->workers->idle_list);
+               }
                spin_unlock_irqrestore(&worker->workers->lock, flags);
        }
 }
@@ -87,23 +95,49 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
                unsigned long flags;
                spin_lock_irqsave(&worker->workers->lock, flags);
                worker->idle = 0;
-               list_move_tail(&worker->worker_list,
-                              &worker->workers->worker_list);
+
+               if (!list_empty(&worker->worker_list)) {
+                       list_move_tail(&worker->worker_list,
+                                     &worker->workers->worker_list);
+               }
                spin_unlock_irqrestore(&worker->workers->lock, flags);
        }
 }
 
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
-                                           struct btrfs_work *work)
+static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 {
+       struct btrfs_workers *workers = worker->workers;
        unsigned long flags;
 
+       rmb();
+       if (!workers->atomic_start_pending)
+               return;
+
+       spin_lock_irqsave(&workers->lock, flags);
+       if (!workers->atomic_start_pending)
+               goto out;
+
+       workers->atomic_start_pending = 0;
+       if (workers->num_workers >= workers->max_workers)
+               goto out;
+
+       spin_unlock_irqrestore(&workers->lock, flags);
+       btrfs_start_workers(workers, 1);
+       return;
+
+out:
+       spin_unlock_irqrestore(&workers->lock, flags);
+}
+
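check_pending_worker_creates() is the consumer half of a deferred-start scheme: code running where sleeping is forbidden only sets atomic_start_pending (see find_worker() below), and a worker thread later re-checks the flag under the lock before actually spawning. A rough userspace sketch of that shape, with a pthread mutex standing in for the kernel spinlock (all names here are illustrative, not btrfs API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static int start_pending;       /* set from a context that must not sleep */
static int num_workers;
static int max_workers = 4;

static void check_pending_creates(void)
{
        if (!start_pending)     /* unlocked fast path, like the rmb() check */
                return;

        pthread_mutex_lock(&pool_lock);
        if (!start_pending || num_workers >= max_workers) {
                pthread_mutex_unlock(&pool_lock);
                return;
        }
        start_pending = 0;
        pthread_mutex_unlock(&pool_lock);

        /* outside the lock it is now safe to sleep and allocate */
        printf("starting one worker\n");
}

Re-checking the flag after taking the lock is what keeps two workers from both acting on the same pending request.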
+static noinline int run_ordered_completions(struct btrfs_workers *workers,
+                                           struct btrfs_work *work)
+{
        if (!workers->ordered)
                return 0;
 
        set_bit(WORK_DONE_BIT, &work->flags);
 
-       spin_lock_irqsave(&workers->lock, flags);
+       spin_lock(&workers->order_lock);
 
        while (1) {
                if (!list_empty(&workers->prio_order_list)) {
@@ -126,45 +160,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
                if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
                        break;
 
-               spin_unlock_irqrestore(&workers->lock, flags);
+               spin_unlock(&workers->order_lock);
 
                work->ordered_func(work);
 
                /* now take the lock again and call the freeing code */
-               spin_lock_irqsave(&workers->lock, flags);
+               spin_lock(&workers->order_lock);
                list_del(&work->order_list);
                work->ordered_free(work);
        }
 
-       spin_unlock_irqrestore(&workers->lock, flags);
+       spin_unlock(&workers->order_lock);
        return 0;
 }
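run_ordered_completions() lets work items finish in any order while still firing their ordered_func callbacks in queue order, and this change moves that serialization onto the dedicated order_lock so it no longer contends with workers->lock. A simplified userspace sketch of the drain-from-the-head idea (one list, a pthread mutex in place of the spinlock; illustrative only):

#include <pthread.h>
#include <stdbool.h>

struct item {
        struct item *next;
        bool done;                              /* analogue of WORK_DONE_BIT */
        void (*ordered_func)(struct item *);
};

static pthread_mutex_t order_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *order_head;                 /* completions, queue order */

static void complete(struct item *it)
{
        pthread_mutex_lock(&order_lock);
        it->done = true;
        /* only ever pop from the head: later items wait for earlier ones */
        while (order_head && order_head->done) {
                struct item *cur = order_head;

                order_head = cur->next;
                pthread_mutex_unlock(&order_lock);
                cur->ordered_func(cur);         /* callback runs unlocked */
                pthread_mutex_lock(&order_lock);
        }
        pthread_mutex_unlock(&order_lock);
}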
 
+static void put_worker(struct btrfs_worker_thread *worker)
+{
+       if (atomic_dec_and_test(&worker->refs))
+               kfree(worker);
+}
+
+static int try_worker_shutdown(struct btrfs_worker_thread *worker)
+{
+       int freeit = 0;
+
+       spin_lock_irq(&worker->lock);
+       spin_lock(&worker->workers->lock);
+       if (worker->workers->num_workers > 1 &&
+           worker->idle &&
+           !worker->working &&
+           !list_empty(&worker->worker_list) &&
+           list_empty(&worker->prio_pending) &&
+           list_empty(&worker->pending) &&
+           atomic_read(&worker->num_pending) == 0) {
+               freeit = 1;
+               list_del_init(&worker->worker_list);
+               worker->workers->num_workers--;
+       }
+       spin_unlock(&worker->workers->lock);
+       spin_unlock_irq(&worker->lock);
+
+       if (freeit)
+               put_worker(worker);
+       return freeit;
+}
+
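put_worker() and try_worker_shutdown() add plain reference counting for worker lifetime: the thread holds the initial reference, anyone else who may still touch the struct takes another, and whoever drops the count to zero frees it. The same rule as a minimal userspace sketch using C11 atomics (names are illustrative):

#include <stdatomic.h>
#include <stdlib.h>

struct worker {
        atomic_int refs;        /* starts at 1, owned by the thread itself */
        /* ... pending lists, lock, task ... */
};

static struct worker *worker_alloc(void)
{
        struct worker *w = calloc(1, sizeof(*w));

        if (w)
                atomic_store(&w->refs, 1);
        return w;
}

static void get_worker(struct worker *w)
{
        atomic_fetch_add(&w->refs, 1);
}

static void put_worker(struct worker *w)
{
        /* fetch_sub returns the old value; 1 means we were the last user */
        if (atomic_fetch_sub(&w->refs, 1) == 1)
                free(w);
}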
+static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
+                                       struct list_head *prio_head,
+                                       struct list_head *head)
+{
+       struct btrfs_work *work = NULL;
+       struct list_head *cur = NULL;
+
+       if (!list_empty(prio_head))
+               cur = prio_head->next;
+
+       smp_mb();
+       if (!list_empty(&worker->prio_pending))
+               goto refill;
+
+       if (!list_empty(head))
+               cur = head->next;
+
+       if (cur)
+               goto out;
+
+refill:
+       spin_lock_irq(&worker->lock);
+       list_splice_tail_init(&worker->prio_pending, prio_head);
+       list_splice_tail_init(&worker->pending, head);
+
+       if (!list_empty(prio_head))
+               cur = prio_head->next;
+       else if (!list_empty(head))
+               cur = head->next;
+       spin_unlock_irq(&worker->lock);
+
+       if (!cur)
+               goto out_fail;
+
+out:
+       work = list_entry(cur, struct btrfs_work, list);
+
+out_fail:
+       return work;
+}
+
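get_next_work() cuts lock traffic by splicing the shared pending lists onto caller-local heads in a single locked operation and then walking them lock-free, refilling only when both local lists run dry (prio_pending always drains first). The core splice-and-consume idea in a userspace sketch (a singly linked chain for brevity; illustrative, not the btrfs list_head API):

#include <pthread.h>
#include <stddef.h>

struct node { struct node *next; };

static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_queue;       /* producers push here under qlock */

static struct node *grab_batch(void)
{
        struct node *batch;

        pthread_mutex_lock(&qlock);
        batch = shared_queue;           /* splice the whole chain out */
        shared_queue = NULL;
        pthread_mutex_unlock(&qlock);
        return batch;                   /* consume lock-free from here on */
}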
 /*
  * main loop for servicing work items
  */
 static int worker_loop(void *arg)
 {
        struct btrfs_worker_thread *worker = arg;
-       struct list_head *cur;
+       struct list_head head;
+       struct list_head prio_head;
        struct btrfs_work *work;
+
+       INIT_LIST_HEAD(&head);
+       INIT_LIST_HEAD(&prio_head);
+
        do {
-               spin_lock_irq(&worker->lock);
-again_locked:
+again:
                while (1) {
-                       if (!list_empty(&worker->prio_pending))
-                               cur = worker->prio_pending.next;
-                       else if (!list_empty(&worker->pending))
-                               cur = worker->pending.next;
-                       else
+                       work = get_next_work(worker, &prio_head, &head);
+                       if (!work)
                                break;
 
-                       work = list_entry(cur, struct btrfs_work, list);
                        list_del(&work->list);
                        clear_bit(WORK_QUEUED_BIT, &work->flags);
 
                        work->worker = worker;
-                       spin_unlock_irq(&worker->lock);
 
                        work->func(work);
 
@@ -175,9 +282,13 @@ again_locked:
                         */
                        run_ordered_completions(worker->workers, work);
 
-                       spin_lock_irq(&worker->lock);
-                       check_idle_worker(worker);
+                       check_pending_worker_creates(worker);
+
                }
+
+               spin_lock_irq(&worker->lock);
+               check_idle_worker(worker);
+
                if (freezing(current)) {
                        worker->working = 0;
                        spin_unlock_irq(&worker->lock);
@@ -216,8 +327,10 @@ again_locked:
                                spin_lock_irq(&worker->lock);
                                set_current_state(TASK_INTERRUPTIBLE);
                                if (!list_empty(&worker->pending) ||
-                                   !list_empty(&worker->prio_pending))
-                                       goto again_locked;
+                                   !list_empty(&worker->prio_pending)) {
+                                       spin_unlock_irq(&worker->lock);
+                                       goto again;
+                               }
 
                                /*
                                 * this makes sure we get a wakeup when someone
@@ -226,8 +339,13 @@ again_locked:
                                worker->working = 0;
                                spin_unlock_irq(&worker->lock);
 
-                               if (!kthread_should_stop())
-                                       schedule();
+                               if (!kthread_should_stop()) {
+                                       schedule_timeout(HZ * 120);
+                                       if (!worker->working &&
+                                           try_worker_shutdown(worker)) {
+                                               return 0;
+                                       }
+                               }
                        }
                        __set_current_state(TASK_RUNNING);
                }
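With schedule() replaced by schedule_timeout(HZ * 120), an idle worker sleeps at most two minutes before try_worker_shutdown() can retire it; that is what lets the pools start with a single thread and shrink again after load spikes. The same wait-with-deadline-then-exit shape in a pthread sketch (illustrative, not the kernel code path):

#include <pthread.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int have_work;

/* returns nonzero when the worker stayed idle for the whole timeout */
static int idle_wait_or_exit(void)
{
        struct timespec ts;
        int err = 0;
        int timed_out;

        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += 120;               /* mirrors schedule_timeout(HZ * 120) */

        pthread_mutex_lock(&lock);
        while (!have_work && err == 0)
                err = pthread_cond_timedwait(&cond, &lock, &ts);
        timed_out = !have_work;         /* decide while still holding the lock */
        pthread_mutex_unlock(&lock);

        return timed_out;
}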
@@ -242,16 +360,30 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 {
        struct list_head *cur;
        struct btrfs_worker_thread *worker;
+       int can_stop;
 
+       spin_lock_irq(&workers->lock);
        list_splice_init(&workers->idle_list, &workers->worker_list);
        while (!list_empty(&workers->worker_list)) {
                cur = workers->worker_list.next;
                worker = list_entry(cur, struct btrfs_worker_thread,
                                    worker_list);
-               kthread_stop(worker->task);
-               list_del(&worker->worker_list);
-               kfree(worker);
+
+               atomic_inc(&worker->refs);
+               workers->num_workers -= 1;
+               if (!list_empty(&worker->worker_list)) {
+                       list_del_init(&worker->worker_list);
+                       put_worker(worker);
+                       can_stop = 1;
+               } else
+                       can_stop = 0;
+               spin_unlock_irq(&workers->lock);
+               if (can_stop)
+                       kthread_stop(worker->task);
+               spin_lock_irq(&workers->lock);
+               put_worker(worker);
        }
+       spin_unlock_irq(&workers->lock);
        return 0;
 }
 
@@ -266,10 +398,13 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
        INIT_LIST_HEAD(&workers->order_list);
        INIT_LIST_HEAD(&workers->prio_order_list);
        spin_lock_init(&workers->lock);
+       spin_lock_init(&workers->order_lock);
        workers->max_workers = max;
        workers->idle_thresh = 32;
        workers->name = name;
        workers->ordered = 0;
+       workers->atomic_start_pending = 0;
+       workers->atomic_worker_start = 0;
 }
 
 /*
@@ -293,7 +428,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                INIT_LIST_HEAD(&worker->prio_pending);
                INIT_LIST_HEAD(&worker->worker_list);
                spin_lock_init(&worker->lock);
+
                atomic_set(&worker->num_pending, 0);
+               atomic_set(&worker->refs, 1);
                worker->workers = workers;
                worker->task = kthread_run(worker_loop, worker,
                                           "btrfs-%s-%d", workers->name,
@@ -303,7 +440,6 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                        kfree(worker);
                        goto fail;
                }
-
                spin_lock_irq(&workers->lock);
                list_add_tail(&worker->worker_list, &workers->idle_list);
                worker->idle = 1;
@@ -350,7 +486,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
         */
        next = workers->worker_list.next;
        worker = list_entry(next, struct btrfs_worker_thread, worker_list);
-       atomic_inc(&worker->num_pending);
        worker->sequence++;
 
        if (worker->sequence % workers->idle_thresh == 0)
@@ -367,28 +502,18 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 {
        struct btrfs_worker_thread *worker;
        unsigned long flags;
+       struct list_head *fallback;
 
 again:
        spin_lock_irqsave(&workers->lock, flags);
        worker = next_worker(workers);
-       spin_unlock_irqrestore(&workers->lock, flags);
 
        if (!worker) {
-               spin_lock_irqsave(&workers->lock, flags);
                if (workers->num_workers >= workers->max_workers) {
-                       struct list_head *fallback = NULL;
-                       /*
-                        * we have failed to find any workers, just
-                        * return the force one
-                        */
-                       if (!list_empty(&workers->worker_list))
-                               fallback = workers->worker_list.next;
-                       if (!list_empty(&workers->idle_list))
-                               fallback = workers->idle_list.next;
-                       BUG_ON(!fallback);
-                       worker = list_entry(fallback,
-                                 struct btrfs_worker_thread, worker_list);
-                       spin_unlock_irqrestore(&workers->lock, flags);
+                       goto fallback;
+               } else if (workers->atomic_worker_start) {
+                       workers->atomic_start_pending = 1;
+                       goto fallback;
                } else {
                        spin_unlock_irqrestore(&workers->lock, flags);
                        /* we're below the limit, start another worker */
@@ -396,6 +521,28 @@ again:
                        goto again;
                }
        }
+       goto found;
+
+fallback:
+       fallback = NULL;
+       /*
+        * we have failed to find any workers, just
+        * return the first one we can find.
+        */
+       if (!list_empty(&workers->worker_list))
+               fallback = workers->worker_list.next;
+       if (!list_empty(&workers->idle_list))
+               fallback = workers->idle_list.next;
+       BUG_ON(!fallback);
+       worker = list_entry(fallback,
+                 struct btrfs_worker_thread, worker_list);
+found:
+       /*
+        * this makes sure the worker doesn't exit before it is placed
+        * onto a busy/idle list
+        */
+       atomic_inc(&worker->num_pending);
+       spin_unlock_irqrestore(&workers->lock, flags);
        return worker;
 }
 
@@ -427,7 +574,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
                spin_lock(&worker->workers->lock);
                worker->idle = 0;
                list_move_tail(&worker->worker_list,
-                              &worker->workers->worker_list);
+                             &worker->workers->worker_list);
                spin_unlock(&worker->workers->lock);
        }
        if (!worker->working) {
@@ -435,9 +582,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
                worker->working = 1;
        }
 
-       spin_unlock_irqrestore(&worker->lock, flags);
        if (wake)
                wake_up_process(worker->task);
+       spin_unlock_irqrestore(&worker->lock, flags);
 out:
 
        return 0;
@@ -463,14 +610,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
        worker = find_worker(workers);
        if (workers->ordered) {
-               spin_lock_irqsave(&workers->lock, flags);
+               /*
+                * you're not allowed to do ordered queues from an
+                * interrupt handler
+                */
+               spin_lock(&workers->order_lock);
                if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
                        list_add_tail(&work->order_list,
                                      &workers->prio_order_list);
                } else {
                        list_add_tail(&work->order_list, &workers->order_list);
                }
-               spin_unlock_irqrestore(&workers->lock, flags);
+               spin_unlock(&workers->order_lock);
        } else {
                INIT_LIST_HEAD(&work->order_list);
        }
@@ -481,7 +632,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
                list_add_tail(&work->list, &worker->prio_pending);
        else
                list_add_tail(&work->list, &worker->pending);
-       atomic_inc(&worker->num_pending);
        check_busy_worker(worker);
 
        /*
@@ -492,10 +642,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
                wake = 1;
        worker->working = 1;
 
-       spin_unlock_irqrestore(&worker->lock, flags);
-
        if (wake)
                wake_up_process(worker->task);
+       spin_unlock_irqrestore(&worker->lock, flags);
+
 out:
        return 0;
 }
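In both btrfs_requeue_work() and btrfs_queue_worker() the wake_up_process() call now happens before the unlock, closing the window in which a newly shrinkable worker could pass its idle checks and exit between our list_add and our wakeup. This is the classic signal-while-holding-the-lock pattern, sketched with pthreads (illustrative):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int pending;

static void queue_one(void)
{
        pthread_mutex_lock(&lock);
        pending++;
        pthread_cond_signal(&cond);     /* wake while still holding the lock */
        pthread_mutex_unlock(&lock);
}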
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1b511c109db658ef1d772bb90d77b795f780e63d..fc089b95ec14f24c9a971b44f3718c2029f241ef 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -73,6 +73,15 @@ struct btrfs_workers {
        /* force completions in the order they were queued */
        int ordered;
 
+       /* more workers required, but in an interrupt handler */
+       int atomic_start_pending;
+
+       /*
+        * are we allowed to sleep while starting workers or are we required
+        * to start them at a later time?
+        */
+       int atomic_worker_start;
+
        /* list with all the work threads.  The workers on the idle thread
         * may be actively servicing jobs, but they haven't yet hit the
         * idle thresh limit above.
@@ -90,6 +99,9 @@ struct btrfs_workers {
        /* lock for finding the next worker thread to queue on */
        spinlock_t lock;
 
+       /* lock for the ordered lists */
+       spinlock_t order_lock;
+
        /* extra name for this worker, used for current->name */
        char *name;
 };
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ea1ea0af8c0e6cf635c3060a08e1dd648a6fb48f..82ee56bba29966e5dee552d764bb0e60fbc8bca9 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -138,6 +138,7 @@ struct btrfs_inode {
         * of these.
         */
        unsigned ordered_data_close:1;
+       unsigned dummy_inode:1;
 
        struct inode vfs_inode;
 };
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9d8ba4d54a37c3f96e9585de46b8e99d701410bc..a11a32058b50a4993f072fd1baddc6e9dafb52a8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                 */
                set_page_extent_mapped(page);
                lock_extent(tree, last_offset, end, GFP_NOFS);
-               spin_lock(&em_tree->lock);
+               read_lock(&em_tree->lock);
                em = lookup_extent_mapping(em_tree, last_offset,
                                           PAGE_CACHE_SIZE);
-               spin_unlock(&em_tree->lock);
+               read_unlock(&em_tree->lock);
 
                if (!em || last_offset < em->start ||
                    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        em_tree = &BTRFS_I(inode)->extent_tree;
 
        /* we need the actual starting offset of this extent in the file */
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree,
                                   page_offset(bio->bi_io_vec->bv_page),
                                   PAGE_CACHE_SIZE);
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
 
        compressed_len = em->block_len;
        cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
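These hunks are part of a tree-wide switch of the extent map tree's lock from a spinlock to a rwlock: lookups take it shared, while paths that modify the tree (see the disk-io.c hunks below) take it exclusive, so concurrent readers stop serializing on one lock. The same split in a userspace sketch (pthread rwlock; the lookup and insert bodies are elided):

#include <pthread.h>
#include <stddef.h>

static pthread_rwlock_t em_tree_lock = PTHREAD_RWLOCK_INITIALIZER;

static void *em_lookup(void)
{
        void *em;

        pthread_rwlock_rdlock(&em_tree_lock);   /* readers run concurrently */
        em = NULL;                              /* ... tree lookup here ... */
        pthread_rwlock_unlock(&em_tree_lock);
        return em;
}

static void em_insert(void *em)
{
        (void)em;
        pthread_rwlock_wrlock(&em_tree_lock);   /* writers are exclusive */
        /* ... tree insert here ... */
        pthread_rwlock_unlock(&em_tree_lock);
}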
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3fdcc0512d3ab62f95d42708ca0d6a049340b877..ec96f3a6d536640919dd25a08c7ed22e4423ef15 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
        int split;
        int num_doubles = 0;
 
+       l = path->nodes[0];
+       slot = path->slots[0];
+       if (extend && data_size + btrfs_item_size_nr(l, slot) +
+           sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
+               return -EOVERFLOW;
+
        /* first try to make some room by pushing left and right */
        if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
                wret = push_leaf_right(trans, root, path, data_size, 0);
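The new check in split_leaf() fails fast with -EOVERFLOW when an item is being extended beyond what even an empty leaf could hold, since no amount of splitting can create that much room. The arithmetic as a tiny illustrative helper (the constants are made up, not btrfs's real leaf geometry):

#include <errno.h>

#define LEAF_CAPACITY   3995    /* hypothetical usable bytes per leaf */
#define ITEM_HEADER     25      /* hypothetical per-item header size */

static int extend_fits(int item_size, int grow_by)
{
        if (grow_by + item_size + ITEM_HEADER > LEAF_CAPACITY)
                return -EOVERFLOW;      /* splitting cannot make this fit */
        return 0;
}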
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 837435ce84caa104dcb00ba9830df29a47e95923..80599b4e42bd350f5e6b7a094c3711a3dd32e8a8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -114,6 +114,10 @@ struct btrfs_ordered_sum;
  */
 #define BTRFS_DEV_ITEMS_OBJECTID 1ULL
 
+#define BTRFS_BTREE_INODE_OBJECTID 1
+
+#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+
 /*
  * we can actually store much bigger names, but lets not confuse the rest
  * of linux
@@ -670,6 +674,7 @@ struct btrfs_space_info {
        u64 bytes_reserved;     /* total bytes the allocator has reserved for
                                   current allocations */
        u64 bytes_readonly;     /* total bytes that are read only */
+       u64 bytes_super;        /* total bytes reserved for the super blocks */
 
        /* delalloc accounting */
        u64 bytes_delalloc;     /* number of bytes reserved for allocation,
@@ -726,6 +731,15 @@ enum btrfs_caching_type {
        BTRFS_CACHE_FINISHED    = 2,
 };
 
+struct btrfs_caching_control {
+       struct list_head list;
+       struct mutex mutex;
+       wait_queue_head_t wait;
+       struct btrfs_block_group_cache *block_group;
+       u64 progress;
+       atomic_t count;
+};
+
 struct btrfs_block_group_cache {
        struct btrfs_key key;
        struct btrfs_block_group_item item;
@@ -733,6 +747,7 @@ struct btrfs_block_group_cache {
        spinlock_t lock;
        u64 pinned;
        u64 reserved;
+       u64 bytes_super;
        u64 flags;
        u64 sectorsize;
        int extents_thresh;
@@ -742,8 +757,9 @@ struct btrfs_block_group_cache {
        int dirty;
 
        /* cache tracking stuff */
-       wait_queue_head_t caching_q;
        int cached;
+       struct btrfs_caching_control *caching_ctl;
+       u64 last_byte_to_unpin;
 
        struct btrfs_space_info *space_info;
 
@@ -782,13 +798,16 @@ struct btrfs_fs_info {
 
        /* the log root tree is a directory of all the other log roots */
        struct btrfs_root *log_root_tree;
+
+       spinlock_t fs_roots_radix_lock;
        struct radix_tree_root fs_roots_radix;
 
        /* block group cache stuff */
        spinlock_t block_group_cache_lock;
        struct rb_root block_group_cache_tree;
 
-       struct extent_io_tree pinned_extents;
+       struct extent_io_tree freed_extents[2];
+       struct extent_io_tree *pinned_extents;
 
        /* logical->physical extent mapping */
        struct btrfs_mapping_tree mapping_tree;
@@ -822,11 +841,7 @@ struct btrfs_fs_info {
        struct mutex transaction_kthread_mutex;
        struct mutex cleaner_mutex;
        struct mutex chunk_mutex;
-       struct mutex drop_mutex;
        struct mutex volume_mutex;
-       struct mutex tree_reloc_mutex;
-       struct rw_semaphore extent_commit_sem;
-
        /*
         * this protects the ordered operations list only while we are
         * processing all of the entries on it.  This way we make
@@ -835,10 +850,16 @@ struct btrfs_fs_info {
         * before jumping into the main commit.
         */
        struct mutex ordered_operations_mutex;
+       struct rw_semaphore extent_commit_sem;
+
+       struct rw_semaphore subvol_sem;
+
+       struct srcu_struct subvol_srcu;
 
        struct list_head trans_list;
        struct list_head hashers;
        struct list_head dead_roots;
+       struct list_head caching_block_groups;
 
        atomic_t nr_async_submits;
        atomic_t async_submit_draining;
@@ -996,10 +1017,12 @@ struct btrfs_root {
        u32 stripesize;
 
        u32 type;
-       u64 highest_inode;
-       u64 last_inode_alloc;
+
+       u64 highest_objectid;
        int ref_cows;
        int track_dirty;
+       int in_radix;
+
        u64 defrag_trans_start;
        struct btrfs_key defrag_progress;
        struct btrfs_key defrag_max;
@@ -1920,8 +1943,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_pinned_extents(struct btrfs_root *root,
-                               u64 bytenr, u64 num, int pin);
+int btrfs_pin_extent(struct btrfs_root *root,
+                    u64 bytenr, u64 num, int reserved);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
@@ -1971,9 +1994,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root,
-                              struct extent_io_tree *unpin);
+                              struct btrfs_root *root);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
@@ -1984,6 +2008,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, u64 bytes_used,
                           u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -2006,7 +2031,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
                                 u64 bytes);
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
                              u64 bytes);
-void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@ -2100,12 +2124,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct extent_buffer *parent);
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
-                  struct btrfs_path *path,
-                  u64 root_id, u64 ref_id);
+                       struct btrfs_path *path,
+                       u64 root_id, u64 ref_id);
 int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
                       struct btrfs_root *tree_root,
-                      u64 root_id, u8 type, u64 ref_id,
-                      u64 dirid, u64 sequence,
+                      u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+                      const char *name, int name_len);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *tree_root,
+                      u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
                       const char *name, int name_len);
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_key *key);
@@ -2120,6 +2147,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 int btrfs_search_root(struct btrfs_root *root, u64 search_start,
                      u64 *found_objectid);
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 int btrfs_set_root_node(struct btrfs_root_item *item,
                        struct extent_buffer *node);
 /* dir-item.c */
@@ -2138,6 +2166,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
                            struct btrfs_path *path, u64 dir,
                            u64 objectid, const char *name, int name_len,
                            int mod);
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+                           struct btrfs_path *path, u64 dirid,
+                           const char *name, int name_len);
 struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
                              struct btrfs_path *path,
                              const char *name, int name_len);
@@ -2160,6 +2192,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 offset);
 int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, u64 offset);
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
 
 /* inode-map.c */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
@@ -2232,6 +2265,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 int btrfs_add_link(struct btrfs_trans_handle *trans,
                   struct inode *parent_inode, struct inode *inode,
                   const char *name, int name_len, int add_backref, u64 index);
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       struct inode *dir, u64 objectid,
+                       const char *name, int name_len);
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct inode *inode, u64 new_size,
@@ -2242,7 +2279,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
                     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *new_root, struct dentry *dentry,
+                            struct btrfs_root *new_root,
                             u64 new_dirid, u64 alloc_hint);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
                         size_t size, struct bio *bio, unsigned long bio_flags);
@@ -2258,6 +2295,7 @@ int btrfs_write_inode(struct inode *inode, int wait);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
+void btrfs_drop_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
@@ -2275,6 +2313,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
+int btrfs_invalidate_inodes(struct btrfs_root *root);
+extern struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2290,7 +2330,7 @@ extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct inode *inode,
                       u64 start, u64 end, u64 locked_end,
-                      u64 inline_limit, u64 *hint_block);
+                      u64 inline_limit, u64 *hint_block, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct inode *inode, u64 start, u64 end);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1d70236ba00c7bf4abf5181d2efd3b20d5867e2e..f3a6075519ccc1d96e42157f95769a5ad6641a12 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
        return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+                           struct btrfs_path *path, u64 dirid,
+                           const char *name, int name_len)
+{
+       struct extent_buffer *leaf;
+       struct btrfs_dir_item *di;
+       struct btrfs_key key;
+       u32 nritems;
+       int ret;
+
+       key.objectid = dirid;
+       key.type = BTRFS_DIR_INDEX_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               return ERR_PTR(ret);
+
+       leaf = path->nodes[0];
+       nritems = btrfs_header_nritems(leaf);
+
+       while (1) {
+               if (path->slots[0] >= nritems) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               return ERR_PTR(ret);
+                       if (ret > 0)
+                               break;
+                       leaf = path->nodes[0];
+                       nritems = btrfs_header_nritems(leaf);
+                       continue;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+               if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
+                       break;
+
+               di = btrfs_match_dir_item_name(root, path, name, name_len);
+               if (di)
+                       return di;
+
+               path->slots[0]++;
+       }
+       return NULL;
+}
+
 struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path, u64 dir,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6c4173146bb739060de46b6776b3f471250491ee..644e796fd643e045ca0b0ed057743b7eb5e88438 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -41,6 +41,7 @@
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
+static void free_fs_root(struct btrfs_root *root);
 
 static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
 
@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
        struct extent_map *em;
        int ret;
 
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        if (em) {
                em->bdev =
                        BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-               spin_unlock(&em_tree->lock);
+               read_unlock(&em_tree->lock);
                goto out;
        }
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
 
        em = alloc_extent_map(GFP_NOFS);
        if (!em) {
@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
        em->block_start = 0;
        em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
-       spin_lock(&em_tree->lock);
+       write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em);
        if (ret == -EEXIST) {
                u64 failed_start = em->start;
@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
                free_extent_map(em);
                em = NULL;
        }
-       spin_unlock(&em_tree->lock);
+       write_unlock(&em_tree->lock);
 
        if (ret)
                em = ERR_PTR(ret);
@@ -895,8 +896,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->fs_info = fs_info;
        root->objectid = objectid;
        root->last_trans = 0;
-       root->highest_inode = 0;
-       root->last_inode_alloc = 0;
+       root->highest_objectid = 0;
        root->name = NULL;
        root->in_sysfs = 0;
        root->inode_tree.rb_node = NULL;
@@ -952,14 +952,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
                     root, fs_info, objectid);
        ret = btrfs_find_last_root(tree_root, objectid,
                                   &root->root_item, &root->root_key);
+       if (ret > 0)
+               return -ENOENT;
        BUG_ON(ret);
 
        generation = btrfs_root_generation(&root->root_item);
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
-       root->commit_root = btrfs_root_node(root);
        BUG_ON(!root->node);
+       root->commit_root = btrfs_root_node(root);
        return 0;
 }
 
@@ -1095,7 +1097,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
        struct btrfs_fs_info *fs_info = tree_root->fs_info;
        struct btrfs_path *path;
        struct extent_buffer *l;
-       u64 highest_inode;
        u64 generation;
        u32 blocksize;
        int ret = 0;
@@ -1110,7 +1111,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
                        kfree(root);
                        return ERR_PTR(ret);
                }
-               goto insert;
+               goto out;
        }
 
        __setup_root(tree_root->nodesize, tree_root->leafsize,
@@ -1120,39 +1121,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
        path = btrfs_alloc_path();
        BUG_ON(!path);
        ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
-       if (ret != 0) {
-               if (ret > 0)
-                       ret = -ENOENT;
-               goto out;
+       if (ret == 0) {
+               l = path->nodes[0];
+               read_extent_buffer(l, &root->root_item,
+                               btrfs_item_ptr_offset(l, path->slots[0]),
+                               sizeof(root->root_item));
+               memcpy(&root->root_key, location, sizeof(*location));
        }
-       l = path->nodes[0];
-       read_extent_buffer(l, &root->root_item,
-              btrfs_item_ptr_offset(l, path->slots[0]),
-              sizeof(root->root_item));
-       memcpy(&root->root_key, location, sizeof(*location));
-       ret = 0;
-out:
-       btrfs_release_path(root, path);
        btrfs_free_path(path);
        if (ret) {
-               kfree(root);
+               if (ret > 0)
+                       ret = -ENOENT;
                return ERR_PTR(ret);
        }
+
        generation = btrfs_root_generation(&root->root_item);
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
        root->commit_root = btrfs_root_node(root);
        BUG_ON(!root->node);
-insert:
-       if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+out:
+       if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
                root->ref_cows = 1;
-               ret = btrfs_find_highest_inode(root, &highest_inode);
-               if (ret == 0) {
-                       root->highest_inode = highest_inode;
-                       root->last_inode_alloc = highest_inode;
-               }
-       }
+
        return root;
 }
 
@@ -1187,39 +1179,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
                return fs_info->dev_root;
        if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
                return fs_info->csum_root;
-
+again:
+       spin_lock(&fs_info->fs_roots_radix_lock);
        root = radix_tree_lookup(&fs_info->fs_roots_radix,
                                 (unsigned long)location->objectid);
+       spin_unlock(&fs_info->fs_roots_radix_lock);
        if (root)
                return root;
 
+       ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+       if (ret == 0)
+               ret = -ENOENT;
+       if (ret < 0)
+               return ERR_PTR(ret);
+
        root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
        if (IS_ERR(root))
                return root;
 
+       WARN_ON(btrfs_root_refs(&root->root_item) == 0);
        set_anon_super(&root->anon_super, NULL);
 
+       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+       if (ret)
+               goto fail;
+
+       spin_lock(&fs_info->fs_roots_radix_lock);
        ret = radix_tree_insert(&fs_info->fs_roots_radix,
                                (unsigned long)root->root_key.objectid,
                                root);
+       if (ret == 0)
+               root->in_radix = 1;
+       spin_unlock(&fs_info->fs_roots_radix_lock);
+       radix_tree_preload_end();
        if (ret) {
-               free_extent_buffer(root->node);
-               kfree(root);
-               return ERR_PTR(ret);
+               if (ret == -EEXIST) {
+                       free_fs_root(root);
+                       goto again;
+               }
+               goto fail;
        }
-       if (!(fs_info->sb->s_flags & MS_RDONLY)) {
-               ret = btrfs_find_dead_roots(fs_info->tree_root,
-                                           root->root_key.objectid);
-               BUG_ON(ret);
+
+       ret = btrfs_find_dead_roots(fs_info->tree_root,
+                                   root->root_key.objectid);
+       WARN_ON(ret);
+
+       if (!(fs_info->sb->s_flags & MS_RDONLY))
                btrfs_orphan_cleanup(root);
-       }
+
        return root;
+fail:
+       free_fs_root(root);
+       return ERR_PTR(ret);
 }
 
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
                                      struct btrfs_key *location,
                                      const char *name, int namelen)
 {
+       return btrfs_read_fs_root_no_name(fs_info, location);
+#if 0
        struct btrfs_root *root;
        int ret;
 
@@ -1236,7 +1255,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
                kfree(root);
                return ERR_PTR(ret);
        }
-#if 0
+
        ret = btrfs_sysfs_add_root(root);
        if (ret) {
                free_extent_buffer(root->node);
@@ -1244,9 +1263,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
                kfree(root);
                return ERR_PTR(ret);
        }
-#endif
        root->in_sysfs = 1;
        return root;
+#endif
 }
 
 static int btrfs_congested_fn(void *congested_data, int bdi_bits)
@@ -1325,9 +1344,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
        offset = page_offset(page);
 
        em_tree = &BTRFS_I(inode)->extent_tree;
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
        if (!em) {
                __unplug_io_fn(bdi, page);
                return;
@@ -1360,8 +1379,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 
        err = bdi_register(bdi, NULL, "btrfs-%d",
                                atomic_inc_return(&btrfs_bdi_num));
-       if (err)
+       if (err) {
+               bdi_destroy(bdi);
                return err;
+       }
 
        bdi->ra_pages   = default_backing_dev_info.ra_pages;
        bdi->unplug_io_fn       = btrfs_unplug_io_fn;
@@ -1451,9 +1472,12 @@ static int cleaner_kthread(void *arg)
                        break;
 
                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
-               mutex_lock(&root->fs_info->cleaner_mutex);
-               btrfs_clean_old_snapshots(root);
-               mutex_unlock(&root->fs_info->cleaner_mutex);
+
+               if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
+                   mutex_trylock(&root->fs_info->cleaner_mutex)) {
+                       btrfs_clean_old_snapshots(root);
+                       mutex_unlock(&root->fs_info->cleaner_mutex);
+               }
 
                if (freezing(current)) {
                        refrigerator();
@@ -1558,15 +1582,36 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                err = -ENOMEM;
                goto fail;
        }
-       INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
+
+       ret = init_srcu_struct(&fs_info->subvol_srcu);
+       if (ret) {
+               err = ret;
+               goto fail;
+       }
+
+       ret = setup_bdi(fs_info, &fs_info->bdi);
+       if (ret) {
+               err = ret;
+               goto fail_srcu;
+       }
+
+       fs_info->btree_inode = new_inode(sb);
+       if (!fs_info->btree_inode) {
+               err = -ENOMEM;
+               goto fail_bdi;
+       }
+
+       INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->hashers);
        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
        INIT_LIST_HEAD(&fs_info->ordered_operations);
+       INIT_LIST_HEAD(&fs_info->caching_block_groups);
        spin_lock_init(&fs_info->delalloc_lock);
        spin_lock_init(&fs_info->new_trans_lock);
        spin_lock_init(&fs_info->ref_cache_lock);
+       spin_lock_init(&fs_info->fs_roots_radix_lock);
 
        init_completion(&fs_info->kobj_unregister);
        fs_info->tree_root = tree_root;
@@ -1585,11 +1630,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->sb = sb;
        fs_info->max_extent = (u64)-1;
        fs_info->max_inline = 8192 * 1024;
-       if (setup_bdi(fs_info, &fs_info->bdi))
-               goto fail_bdi;
-       fs_info->btree_inode = new_inode(sb);
-       fs_info->btree_inode->i_ino = 1;
-       fs_info->btree_inode->i_nlink = 1;
        fs_info->metadata_ratio = 8;
 
        fs_info->thread_pool_size = min_t(unsigned long,
@@ -1602,6 +1642,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        sb->s_blocksize_bits = blksize_bits(4096);
        sb->s_bdi = &fs_info->bdi;
 
+       fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+       fs_info->btree_inode->i_nlink = 1;
        /*
         * we set the i_size on the btree inode to the max possible int.
         * the real end of the address space is determined by all of
@@ -1620,28 +1662,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
        BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
 
+       BTRFS_I(fs_info->btree_inode)->root = tree_root;
+       memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+              sizeof(struct btrfs_key));
+       BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+       insert_inode_hash(fs_info->btree_inode);
+
        spin_lock_init(&fs_info->block_group_cache_lock);
        fs_info->block_group_cache_tree.rb_node = NULL;
 
-       extent_io_tree_init(&fs_info->pinned_extents,
+       extent_io_tree_init(&fs_info->freed_extents[0],
                             fs_info->btree_inode->i_mapping, GFP_NOFS);
+       extent_io_tree_init(&fs_info->freed_extents[1],
+                            fs_info->btree_inode->i_mapping, GFP_NOFS);
+       fs_info->pinned_extents = &fs_info->freed_extents[0];
        fs_info->do_barriers = 1;
 
-       BTRFS_I(fs_info->btree_inode)->root = tree_root;
-       memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
-              sizeof(struct btrfs_key));
-       insert_inode_hash(fs_info->btree_inode);
 
        mutex_init(&fs_info->trans_mutex);
        mutex_init(&fs_info->ordered_operations_mutex);
        mutex_init(&fs_info->tree_log_mutex);
-       mutex_init(&fs_info->drop_mutex);
        mutex_init(&fs_info->chunk_mutex);
        mutex_init(&fs_info->transaction_kthread_mutex);
        mutex_init(&fs_info->cleaner_mutex);
        mutex_init(&fs_info->volume_mutex);
-       mutex_init(&fs_info->tree_reloc_mutex);
        init_rwsem(&fs_info->extent_commit_sem);
+       init_rwsem(&fs_info->subvol_sem);
 
        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
        btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1700,7 +1746,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                err = -EINVAL;
                goto fail_iput;
        }
-
+       printk(KERN_INFO "btrfs: thread pool is %d\n", fs_info->thread_pool_size);
        /*
         * we need to start all the end_io workers up front because the
         * queue work function gets called at interrupt time, and so it
@@ -1745,20 +1791,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->endio_workers.idle_thresh = 4;
        fs_info->endio_meta_workers.idle_thresh = 4;
 
-       fs_info->endio_write_workers.idle_thresh = 64;
-       fs_info->endio_meta_write_workers.idle_thresh = 64;
+       fs_info->endio_write_workers.idle_thresh = 2;
+       fs_info->endio_meta_write_workers.idle_thresh = 2;
+
+       fs_info->endio_workers.atomic_worker_start = 1;
+       fs_info->endio_meta_workers.atomic_worker_start = 1;
+       fs_info->endio_write_workers.atomic_worker_start = 1;
+       fs_info->endio_meta_write_workers.atomic_worker_start = 1;
 
        btrfs_start_workers(&fs_info->workers, 1);
        btrfs_start_workers(&fs_info->submit_workers, 1);
        btrfs_start_workers(&fs_info->delalloc_workers, 1);
        btrfs_start_workers(&fs_info->fixup_workers, 1);
-       btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
-       btrfs_start_workers(&fs_info->endio_meta_workers,
-                           fs_info->thread_pool_size);
-       btrfs_start_workers(&fs_info->endio_meta_write_workers,
-                           fs_info->thread_pool_size);
-       btrfs_start_workers(&fs_info->endio_write_workers,
-                           fs_info->thread_pool_size);
+       btrfs_start_workers(&fs_info->endio_workers, 1);
+       btrfs_start_workers(&fs_info->endio_meta_workers, 1);
+       btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
+       btrfs_start_workers(&fs_info->endio_write_workers, 1);
 
        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1918,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                }
        }
 
+       ret = btrfs_find_orphan_roots(tree_root);
+       BUG_ON(ret);
+
        if (!(sb->s_flags & MS_RDONLY)) {
                ret = btrfs_recover_relocation(tree_root);
                BUG_ON(ret);
@@ -1977,6 +2028,8 @@ fail_iput:
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
 fail_bdi:
        bdi_destroy(&fs_info->bdi);
+fail_srcu:
+       cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
        kfree(extent_root);
        kfree(tree_root);
@@ -2236,20 +2289,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
-       WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+       spin_lock(&fs_info->fs_roots_radix_lock);
        radix_tree_delete(&fs_info->fs_roots_radix,
                          (unsigned long)root->root_key.objectid);
+       spin_unlock(&fs_info->fs_roots_radix_lock);
+
+       if (btrfs_root_refs(&root->root_item) == 0)
+               synchronize_srcu(&fs_info->subvol_srcu);
+
+       free_fs_root(root);
+       return 0;
+}
+
+static void free_fs_root(struct btrfs_root *root)
+{
+       WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
        if (root->anon_super.s_dev) {
                down_write(&root->anon_super.s_umount);
                kill_anon_super(&root->anon_super);
        }
-       if (root->node)
-               free_extent_buffer(root->node);
-       if (root->commit_root)
-               free_extent_buffer(root->commit_root);
+       free_extent_buffer(root->node);
+       free_extent_buffer(root->commit_root);
        kfree(root->name);
        kfree(root);
-       return 0;
 }
 
 static int del_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2258,6 +2320,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
        struct btrfs_root *gang[8];
        int i;
 
+       while (!list_empty(&fs_info->dead_roots)) {
+               gang[0] = list_entry(fs_info->dead_roots.next,
+                                    struct btrfs_root, root_list);
+               list_del(&gang[0]->root_list);
+
+               if (gang[0]->in_radix) {
+                       btrfs_free_fs_root(fs_info, gang[0]);
+               } else {
+                       free_extent_buffer(gang[0]->node);
+                       free_extent_buffer(gang[0]->commit_root);
+                       kfree(gang[0]);
+               }
+       }
+
        while (1) {
                ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
                                             (void **)gang, 0,
@@ -2287,9 +2363,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
                root_objectid = gang[ret - 1]->root_key.objectid + 1;
                for (i = 0; i < ret; i++) {
                        root_objectid = gang[i]->root_key.objectid;
-                       ret = btrfs_find_dead_roots(fs_info->tree_root,
-                                                   root_objectid);
-                       BUG_ON(ret);
                        btrfs_orphan_cleanup(gang[i]);
                }
                root_objectid++;
@@ -2359,7 +2432,6 @@ int close_ctree(struct btrfs_root *root)
        free_extent_buffer(root->fs_info->csum_root->commit_root);
 
        btrfs_free_block_groups(root->fs_info);
-       btrfs_free_pinned_extents(root->fs_info);
 
        del_fs_roots(fs_info);
 
@@ -2378,6 +2450,7 @@ int close_ctree(struct btrfs_root *root)
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
        bdi_destroy(&fs_info->bdi);
+       cleanup_srcu_struct(&fs_info->subvol_srcu);
 
        kfree(fs_info->extent_root);
        kfree(fs_info->tree_root);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9596b40caa4ea3dd307405a68153a49a3f143483..ba5c3fd5ab8c89e3057aa612f1bd929c1589b2b3 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
        len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
        type = FILEID_BTRFS_WITHOUT_PARENT;
 
-       fid->objectid = BTRFS_I(inode)->location.objectid;
+       fid->objectid = inode->i_ino;
        fid->root_objectid = BTRFS_I(inode)->root->objectid;
        fid->gen = inode->i_generation;
 
@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 }
 
 static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
-                                      u64 root_objectid, u32 generation)
+                                      u64 root_objectid, u32 generation,
+                                      int check_generation)
 {
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
        struct btrfs_root *root;
+       struct dentry *dentry;
        struct inode *inode;
        struct btrfs_key key;
+       int index;
+       int err = 0;
+
+       if (objectid < BTRFS_FIRST_FREE_OBJECTID)
+               return ERR_PTR(-ESTALE);
 
        key.objectid = root_objectid;
        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
        key.offset = (u64)-1;
 
-       root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
-       if (IS_ERR(root))
-               return ERR_CAST(root);
+       index = srcu_read_lock(&fs_info->subvol_srcu);
+
+       root = btrfs_read_fs_root_no_name(fs_info, &key);
+       if (IS_ERR(root)) {
+               err = PTR_ERR(root);
+               goto fail;
+       }
+
+       if (btrfs_root_refs(&root->root_item) == 0) {
+               err = -ENOENT;
+               goto fail;
+       }
 
        key.objectid = objectid;
        btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
        key.offset = 0;
 
        inode = btrfs_iget(sb, &key, root);
-       if (IS_ERR(inode))
-               return (void *)inode;
+       if (IS_ERR(inode)) {
+               err = PTR_ERR(inode);
+               goto fail;
+       }
+
+       srcu_read_unlock(&fs_info->subvol_srcu, index);
 
-       if (generation != inode->i_generation) {
+       if (check_generation && generation != inode->i_generation) {
                iput(inode);
                return ERR_PTR(-ESTALE);
        }
 
-       return d_obtain_alias(inode);
+       dentry = d_obtain_alias(inode);
+       if (!IS_ERR(dentry))
+               dentry->d_op = &btrfs_dentry_operations;
+       return dentry;
+fail:
+       srcu_read_unlock(&fs_info->subvol_srcu, index);
+       return ERR_PTR(err);
 }
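
btrfs_get_dentry() now performs the root and inode lookups inside an SRCU read-side section on subvol_srcu (the read-side counterpart of the cleanup_srcu_struct() call added to close_ctree() earlier in this patch), so a concurrent subvolume deletion cannot free the root while it is being dereferenced. The pattern reduced to its bones, with lookup_something() as a hypothetical placeholder:

	int index, err;

	index = srcu_read_lock(&fs_info->subvol_srcu);
	/*
	 * objects looked up here stay alive until the matching
	 * srcu_read_unlock(); writers call synchronize_srcu()
	 * before actually freeing them
	 */
	err = lookup_something(fs_info);
	srcu_read_unlock(&fs_info->subvol_srcu, index);
	return err;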
 
 static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
        objectid = fid->parent_objectid;
        generation = fid->parent_gen;
 
-       return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+       return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
 }
 
 static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
        root_objectid = fid->root_objectid;
        generation = fid->gen;
 
-       return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+       return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
 }
 
 static struct dentry *btrfs_get_parent(struct dentry *child)
 {
        struct inode *dir = child->d_inode;
+       struct dentry *dentry;
        struct btrfs_root *root = BTRFS_I(dir)->root;
-       struct btrfs_key key;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
-       int slot;
-       u64 objectid;
+       struct btrfs_root_ref *ref;
+       struct btrfs_key key;
+       struct btrfs_key found_key;
        int ret;
 
        path = btrfs_alloc_path();
 
-       key.objectid = dir->i_ino;
-       btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
-       key.offset = (u64)-1;
+       if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               key.objectid = root->root_key.objectid;
+               key.type = BTRFS_ROOT_BACKREF_KEY;
+               key.offset = (u64)-1;
+               root = root->fs_info->tree_root;
+       } else {
+               key.objectid = dir->i_ino;
+               key.type = BTRFS_INODE_REF_KEY;
+               key.offset = (u64)-1;
+       }
 
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-       if (ret < 0) {
-               /* Error */
-               btrfs_free_path(path);
-               return ERR_PTR(ret);
+       if (ret < 0)
+               goto fail;
+
+       BUG_ON(ret == 0);
+       if (path->slots[0] == 0) {
+               ret = -ENOENT;
+               goto fail;
        }
+
+       path->slots[0]--;
        leaf = path->nodes[0];
-       slot = path->slots[0];
-       if (ret) {
-               /* btrfs_search_slot() returns the slot where we'd want to
-                  insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
-                  The _real_ backref, telling us what the parent inode
-                  _actually_ is, will be in the slot _before_ the one
-                  that btrfs_search_slot() returns. */
-               if (!slot) {
-                       /* Unless there is _no_ key in the tree before... */
-                       btrfs_free_path(path);
-                       return ERR_PTR(-EIO);
-               }
-               slot--;
+
+       btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+       if (found_key.objectid != key.objectid || found_key.type != key.type) {
+               ret = -ENOENT;
+               goto fail;
        }
 
-       btrfs_item_key_to_cpu(leaf, &key, slot);
+       if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+               ref = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_root_ref);
+               key.objectid = btrfs_root_ref_dirid(leaf, ref);
+       } else {
+               key.objectid = found_key.offset;
+       }
        btrfs_free_path(path);
 
-       if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
-               return ERR_PTR(-EINVAL);
-
-       objectid = key.offset;
-
-       /* If we are already at the root of a subvol, return the real root */
-       if (objectid == dir->i_ino)
-               return dget(dir->i_sb->s_root);
+       if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+               return btrfs_get_dentry(root->fs_info->sb, key.objectid,
+                                       found_key.offset, 0, 0);
+       }
 
-       /* Build a new key for the inode item */
-       key.objectid = objectid;
-       btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+       key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
-
-       return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+       dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+       if (!IS_ERR(dentry))
+               dentry->d_op = &btrfs_dentry_operations;
+       return dentry;
+fail:
+       btrfs_free_path(path);
+       return ERR_PTR(ret);
 }
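
The parent lookup relies on the search-then-step-back idiom: a search for (objectid, type, (u64)-1) can never hit an exact match, so btrfs_search_slot() lands on the would-be insertion slot, and the slot before it holds the highest existing key for that objectid and type, which is exactly the inode ref or root backref wanted here. In outline (a sketch; the caller still validates found_key, as the code above does):

	struct btrfs_key key, found_key;

	/* key.objectid and key.type describe the item we are after */
	key.offset = (u64)-1;		/* sorts after any real offset */

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	BUG_ON(ret == 0);		/* exact match is impossible */
	if (path->slots[0] == 0)
		return -ENOENT;		/* nothing sorts below the key */
	path->slots[0]--;
	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);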
 
 const struct export_operations btrfs_export_ops = {
index 535f85ba104f41fea1c28820533da7e1af1512c3..993f93ff7ba695c97b490f5e2d5d6d3b5939980c 100644 (file)
 #include "locking.h"
 #include "free-space-cache.h"
 
-static int update_reserved_extents(struct btrfs_root *root,
-                                  u64 bytenr, u64 num, int reserve);
 static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc,
                              int mark_free);
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+                                  u64 num_bytes, int reserve);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
@@ -57,10 +57,17 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins);
-
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force);
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         struct btrfs_path *path,
+                         u64 bytenr, u64 num_bytes,
+                         int is_data, int reserved,
+                         struct extent_buffer **must_clean);
+static int find_next_key(struct btrfs_path *path, int level,
+                        struct btrfs_key *key);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -153,34 +160,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
        return ret;
 }
 
-/*
- * We always set EXTENT_LOCKED for the super mirror extents so we don't
- * overwrite them, so those bits need to be unset.  Also, if we are unmounting
- * with pinned extents still sitting there because we had a block group caching,
- * we need to clear those now, since we are done.
- */
-void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
+static int add_excluded_extent(struct btrfs_root *root,
+                              u64 start, u64 num_bytes)
 {
-       u64 start, end, last = 0;
-       int ret;
+       u64 end = start + num_bytes - 1;
+       set_extent_bits(&root->fs_info->freed_extents[0],
+                       start, end, EXTENT_UPTODATE, GFP_NOFS);
+       set_extent_bits(&root->fs_info->freed_extents[1],
+                       start, end, EXTENT_UPTODATE, GFP_NOFS);
+       return 0;
+}
 
-       while (1) {
-               ret = find_first_extent_bit(&info->pinned_extents, last,
-                                           &start, &end,
-                                           EXTENT_LOCKED|EXTENT_DIRTY);
-               if (ret)
-                       break;
+static void free_excluded_extents(struct btrfs_root *root,
+                                 struct btrfs_block_group_cache *cache)
+{
+       u64 start, end;
 
-               clear_extent_bits(&info->pinned_extents, start, end,
-                                 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
-               last = end+1;
-       }
+       start = cache->key.objectid;
+       end = start + cache->key.offset - 1;
+
+       clear_extent_bits(&root->fs_info->freed_extents[0],
+                         start, end, EXTENT_UPTODATE, GFP_NOFS);
+       clear_extent_bits(&root->fs_info->freed_extents[1],
+                         start, end, EXTENT_UPTODATE, GFP_NOFS);
 }
 
-static int remove_sb_from_cache(struct btrfs_root *root,
-                               struct btrfs_block_group_cache *cache)
+static int exclude_super_stripes(struct btrfs_root *root,
+                                struct btrfs_block_group_cache *cache)
 {
-       struct btrfs_fs_info *fs_info = root->fs_info;
        u64 bytenr;
        u64 *logical;
        int stripe_len;
@@ -192,17 +199,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
                                       cache->key.objectid, bytenr,
                                       0, &logical, &nr, &stripe_len);
                BUG_ON(ret);
+
                while (nr--) {
-                       try_lock_extent(&fs_info->pinned_extents,
-                                       logical[nr],
-                                       logical[nr] + stripe_len - 1, GFP_NOFS);
+                       cache->bytes_super += stripe_len;
+                       ret = add_excluded_extent(root, logical[nr],
+                                                 stripe_len);
+                       BUG_ON(ret);
                }
+
                kfree(logical);
        }
-
        return 0;
 }
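
exclude_super_stripes() replaces the old try_lock_extent() trick: superblock mirror stripes are recorded with EXTENT_UPTODATE in both freed_extents trees, so the exclusion survives the pinned_extents pointer flip done at commit time (btrfs_prepare_extent_commit(), further down), and the accumulated cache->bytes_super feeds the new bytes_super space accounting. A usage sketch matching caching_kthread() below:

	exclude_super_stripes(extent_root, block_group);
	/* ... scan the extent tree and build up the free space ... */
	free_excluded_extents(extent_root, block_group);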
 
+static struct btrfs_caching_control *
+get_caching_control(struct btrfs_block_group_cache *cache)
+{
+       struct btrfs_caching_control *ctl;
+
+       spin_lock(&cache->lock);
+       if (cache->cached != BTRFS_CACHE_STARTED) {
+               spin_unlock(&cache->lock);
+               return NULL;
+       }
+
+       ctl = cache->caching_ctl;
+       atomic_inc(&ctl->count);
+       spin_unlock(&cache->lock);
+       return ctl;
+}
+
+static void put_caching_control(struct btrfs_caching_control *ctl)
+{
+       if (atomic_dec_and_test(&ctl->count))
+               kfree(ctl);
+}
+
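
get_caching_control()/put_caching_control() implement a small refcount protocol: cache_block_group() below starts the count at 2 (one reference for the caching kthread, one for the fs_info->caching_block_groups list), every get takes an extra reference under cache->lock, and the final put frees the struct. The typical caller shape (this mirrors wait_block_group_cache_done() later in the patch):

	struct btrfs_caching_control *ctl;

	ctl = get_caching_control(cache);
	if (!ctl)
		return 0;	/* caching already finished */
	wait_event(ctl->wait, block_group_cache_done(cache));
	put_caching_control(ctl);	/* may drop the last reference */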
 /*
  * this is only called by cache_block_group, since we could have freed extents
  * we need to check the pinned_extents for any extents that can't be used yet
@@ -215,9 +247,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
        int ret;
 
        while (start < end) {
-               ret = find_first_extent_bit(&info->pinned_extents, start,
+               ret = find_first_extent_bit(info->pinned_extents, start,
                                            &extent_start, &extent_end,
-                                           EXTENT_DIRTY|EXTENT_LOCKED);
+                                           EXTENT_DIRTY | EXTENT_UPTODATE);
                if (ret)
                        break;
 
@@ -249,22 +281,27 @@ static int caching_kthread(void *data)
 {
        struct btrfs_block_group_cache *block_group = data;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
-       u64 last = 0;
+       struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
+       struct btrfs_root *extent_root = fs_info->extent_root;
        struct btrfs_path *path;
-       int ret = 0;
-       struct btrfs_key key;
        struct extent_buffer *leaf;
-       int slot;
+       struct btrfs_key key;
        u64 total_found = 0;
-
-       BUG_ON(!fs_info);
+       u64 last = 0;
+       u32 nritems;
+       int ret = 0;
 
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 
-       atomic_inc(&block_group->space_info->caching_threads);
+       exclude_super_stripes(extent_root, block_group);
+       spin_lock(&block_group->space_info->lock);
+       block_group->space_info->bytes_super += block_group->bytes_super;
+       spin_unlock(&block_group->space_info->lock);
+
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
@@ -277,74 +314,64 @@ static int caching_kthread(void *data)
 
        key.objectid = last;
        key.offset = 0;
-       btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+       key.type = BTRFS_EXTENT_ITEM_KEY;
 again:
+       mutex_lock(&caching_ctl->mutex);
        /* need to make sure the commit_root doesn't disappear */
        down_read(&fs_info->extent_commit_sem);
 
-       ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+       ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;
 
+       leaf = path->nodes[0];
+       nritems = btrfs_header_nritems(leaf);
+
        while (1) {
                smp_mb();
-               if (block_group->fs_info->closing > 1) {
+               if (fs_info->closing > 1) {
                        last = (u64)-1;
                        break;
                }
 
-               leaf = path->nodes[0];
-               slot = path->slots[0];
-               if (slot >= btrfs_header_nritems(leaf)) {
-                       ret = btrfs_next_leaf(fs_info->extent_root, path);
-                       if (ret < 0)
-                               goto err;
-                       else if (ret)
+               if (path->slots[0] < nritems) {
+                       btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+               } else {
+                       ret = find_next_key(path, 0, &key);
+                       if (ret)
                                break;
 
-                       if (need_resched() ||
-                           btrfs_transaction_in_commit(fs_info)) {
-                               leaf = path->nodes[0];
-
-                               /* this shouldn't happen, but if the
-                                * leaf is empty just move on.
-                                */
-                               if (btrfs_header_nritems(leaf) == 0)
-                                       break;
-                               /*
-                                * we need to copy the key out so that
-                                * we are sure the next search advances
-                                * us forward in the btree.
-                                */
-                               btrfs_item_key_to_cpu(leaf, &key, 0);
-                               btrfs_release_path(fs_info->extent_root, path);
-                               up_read(&fs_info->extent_commit_sem);
+                       caching_ctl->progress = last;
+                       btrfs_release_path(extent_root, path);
+                       up_read(&fs_info->extent_commit_sem);
+                       mutex_unlock(&caching_ctl->mutex);
+                       if (btrfs_transaction_in_commit(fs_info))
                                schedule_timeout(1);
-                               goto again;
-                       }
+                       else
+                               cond_resched();
+                       goto again;
+               }
 
+               if (key.objectid < block_group->key.objectid) {
+                       path->slots[0]++;
                        continue;
                }
-               btrfs_item_key_to_cpu(leaf, &key, slot);
-               if (key.objectid < block_group->key.objectid)
-                       goto next;
 
                if (key.objectid >= block_group->key.objectid +
                    block_group->key.offset)
                        break;
 
-               if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
+               if (key.type == BTRFS_EXTENT_ITEM_KEY) {
                        total_found += add_new_free_space(block_group,
                                                          fs_info, last,
                                                          key.objectid);
                        last = key.objectid + key.offset;
-               }
 
-               if (total_found > (1024 * 1024 * 2)) {
-                       total_found = 0;
-                       wake_up(&block_group->caching_q);
+                       if (total_found > (1024 * 1024 * 2)) {
+                               total_found = 0;
+                               wake_up(&caching_ctl->wait);
+                       }
                }
-next:
                path->slots[0]++;
        }
        ret = 0;
@@ -352,33 +379,65 @@ next:
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
+       caching_ctl->progress = (u64)-1;
 
        spin_lock(&block_group->lock);
+       block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);
 
 err:
        btrfs_free_path(path);
        up_read(&fs_info->extent_commit_sem);
-       atomic_dec(&block_group->space_info->caching_threads);
-       wake_up(&block_group->caching_q);
 
+       free_excluded_extents(extent_root, block_group);
+
+       mutex_unlock(&caching_ctl->mutex);
+       wake_up(&caching_ctl->wait);
+
+       put_caching_control(caching_ctl);
+       atomic_dec(&block_group->space_info->caching_threads);
        return 0;
 }
 
 static int cache_block_group(struct btrfs_block_group_cache *cache)
 {
+       struct btrfs_fs_info *fs_info = cache->fs_info;
+       struct btrfs_caching_control *caching_ctl;
        struct task_struct *tsk;
        int ret = 0;
 
+       smp_mb();
+       if (cache->cached != BTRFS_CACHE_NO)
+               return 0;
+
+       caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
+       BUG_ON(!caching_ctl);
+
+       INIT_LIST_HEAD(&caching_ctl->list);
+       mutex_init(&caching_ctl->mutex);
+       init_waitqueue_head(&caching_ctl->wait);
+       caching_ctl->block_group = cache;
+       caching_ctl->progress = cache->key.objectid;
+       /* one for caching kthread, one for caching block group list */
+       atomic_set(&caching_ctl->count, 2);
+
        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_NO) {
                spin_unlock(&cache->lock);
-               return ret;
+               kfree(caching_ctl);
+               return 0;
        }
+       cache->caching_ctl = caching_ctl;
        cache->cached = BTRFS_CACHE_STARTED;
        spin_unlock(&cache->lock);
 
+       down_write(&fs_info->extent_commit_sem);
+       list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+       up_write(&fs_info->extent_commit_sem);
+
+       atomic_inc(&cache->space_info->caching_threads);
+
        tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
                          cache->key.objectid);
        if (IS_ERR(tsk)) {
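
cache_block_group() takes the allocate-then-recheck shape: an unlocked fast-path test of cache->cached, an unconditional kzalloc(), then a second test under cache->lock, with the loser of the race throwing its allocation away. The skeleton of the pattern (illustrative only):

	ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_NO) {
		/* lost the race, somebody else started caching */
		spin_unlock(&cache->lock);
		kfree(ctl);
		return 0;
	}
	cache->caching_ctl = ctl;
	cache->cached = BTRFS_CACHE_STARTED;
	spin_unlock(&cache->lock);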
@@ -1657,7 +1716,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
                                                 parent, ref_root, flags,
                                                 ref->objectid, ref->offset,
                                                 &ins, node->ref_mod);
-               update_reserved_extents(root, ins.objectid, ins.offset, 0);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
                                             node->num_bytes, parent,
@@ -1783,7 +1841,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
                                                extent_op->flags_to_set,
                                                &extent_op->key,
                                                ref->level, &ins);
-               update_reserved_extents(root, ins.objectid, ins.offset, 0);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
                                             node->num_bytes, parent, ref_root,
@@ -1818,16 +1875,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                BUG_ON(extent_op);
                head = btrfs_delayed_node_to_head(node);
                if (insert_reserved) {
+                       int mark_free = 0;
+                       struct extent_buffer *must_clean = NULL;
+
+                       ret = pin_down_bytes(trans, root, NULL,
+                                            node->bytenr, node->num_bytes,
+                                            head->is_data, 1, &must_clean);
+                       if (ret > 0)
+                               mark_free = 1;
+
+                       if (must_clean) {
+                               clean_tree_block(NULL, root, must_clean);
+                               btrfs_tree_unlock(must_clean);
+                               free_extent_buffer(must_clean);
+                       }
                        if (head->is_data) {
                                ret = btrfs_del_csums(trans, root,
                                                      node->bytenr,
                                                      node->num_bytes);
                                BUG_ON(ret);
                        }
-                       btrfs_update_pinned_extents(root, node->bytenr,
-                                                   node->num_bytes, 1);
-                       update_reserved_extents(root, node->bytenr,
-                                               node->num_bytes, 0);
+                       if (mark_free) {
+                               ret = btrfs_free_reserved_extent(root,
+                                                       node->bytenr,
+                                                       node->num_bytes);
+                               BUG_ON(ret);
+                       }
                }
                mutex_unlock(&head->mutex);
                return 0;
@@ -2706,6 +2779,8 @@ int btrfs_check_metadata_free_space(struct btrfs_root *root)
        /* get the space info for where the metadata will live */
        alloc_target = btrfs_get_alloc_profile(root, 0);
        meta_sinfo = __find_space_info(info, alloc_target);
+       if (!meta_sinfo)
+               goto alloc;
 
 again:
        spin_lock(&meta_sinfo->lock);
@@ -2717,12 +2792,13 @@ again:
        do_div(thresh, 100);
 
        if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-           meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+           meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+           meta_sinfo->bytes_super > thresh) {
                struct btrfs_trans_handle *trans;
                if (!meta_sinfo->full) {
                        meta_sinfo->force_alloc = 1;
                        spin_unlock(&meta_sinfo->lock);
-
+alloc:
                        trans = btrfs_start_transaction(root, 1);
                        if (!trans)
                                return -ENOMEM;
@@ -2730,6 +2806,10 @@ again:
                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
                                             2 * 1024 * 1024, alloc_target, 0);
                        btrfs_end_transaction(trans, root);
+                       if (!meta_sinfo) {
+                               meta_sinfo = __find_space_info(info,
+                                                              alloc_target);
+                       }
                        goto again;
                }
                spin_unlock(&meta_sinfo->lock);
@@ -2765,13 +2845,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
        data_sinfo = BTRFS_I(inode)->space_info;
+       if (!data_sinfo)
+               goto alloc;
+
 again:
        /* make sure we have enough space to handle the data first */
        spin_lock(&data_sinfo->lock);
        if (data_sinfo->total_bytes - data_sinfo->bytes_used -
            data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
            data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
-           data_sinfo->bytes_may_use < bytes) {
+           data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
                struct btrfs_trans_handle *trans;
 
                /*
@@ -2783,7 +2866,7 @@ again:
 
                        data_sinfo->force_alloc = 1;
                        spin_unlock(&data_sinfo->lock);
-
+alloc:
                        alloc_target = btrfs_get_alloc_profile(root, 1);
                        trans = btrfs_start_transaction(root, 1);
                        if (!trans)
@@ -2795,6 +2878,11 @@ again:
                        btrfs_end_transaction(trans, root);
                        if (ret)
                                return ret;
+
+                       if (!data_sinfo) {
+                               btrfs_set_inode_space_info(root, inode);
+                               data_sinfo = BTRFS_I(inode)->space_info;
+                       }
                        goto again;
                }
                spin_unlock(&data_sinfo->lock);
@@ -3009,10 +3097,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                num_bytes = min(total, cache->key.offset - byte_in_group);
                if (alloc) {
                        old_val += num_bytes;
+                       btrfs_set_block_group_used(&cache->item, old_val);
+                       cache->reserved -= num_bytes;
                        cache->space_info->bytes_used += num_bytes;
+                       cache->space_info->bytes_reserved -= num_bytes;
                        if (cache->ro)
                                cache->space_info->bytes_readonly -= num_bytes;
-                       btrfs_set_block_group_used(&cache->item, old_val);
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);
                } else {
@@ -3057,127 +3147,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
        return bytenr;
 }
 
-int btrfs_update_pinned_extents(struct btrfs_root *root,
-                               u64 bytenr, u64 num, int pin)
+/*
+ * this function must be called within a transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+                    u64 bytenr, u64 num_bytes, int reserved)
 {
-       u64 len;
-       struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_block_group_cache *cache;
 
-       if (pin)
-               set_extent_dirty(&fs_info->pinned_extents,
-                               bytenr, bytenr + num - 1, GFP_NOFS);
-
-       while (num > 0) {
-               cache = btrfs_lookup_block_group(fs_info, bytenr);
-               BUG_ON(!cache);
-               len = min(num, cache->key.offset -
-                         (bytenr - cache->key.objectid));
-               if (pin) {
-                       spin_lock(&cache->space_info->lock);
-                       spin_lock(&cache->lock);
-                       cache->pinned += len;
-                       cache->space_info->bytes_pinned += len;
-                       spin_unlock(&cache->lock);
-                       spin_unlock(&cache->space_info->lock);
-                       fs_info->total_pinned += len;
-               } else {
-                       int unpin = 0;
+       cache = btrfs_lookup_block_group(fs_info, bytenr);
+       BUG_ON(!cache);
 
-                       /*
-                        * in order to not race with the block group caching, we
-                        * only want to unpin the extent if we are cached.  If
-                        * we aren't cached, we want to start async caching this
-                        * block group so we can free the extent the next time
-                        * around.
-                        */
-                       spin_lock(&cache->space_info->lock);
-                       spin_lock(&cache->lock);
-                       unpin = (cache->cached == BTRFS_CACHE_FINISHED);
-                       if (likely(unpin)) {
-                               cache->pinned -= len;
-                               cache->space_info->bytes_pinned -= len;
-                               fs_info->total_pinned -= len;
-                       }
-                       spin_unlock(&cache->lock);
-                       spin_unlock(&cache->space_info->lock);
+       spin_lock(&cache->space_info->lock);
+       spin_lock(&cache->lock);
+       cache->pinned += num_bytes;
+       cache->space_info->bytes_pinned += num_bytes;
+       if (reserved) {
+               cache->reserved -= num_bytes;
+               cache->space_info->bytes_reserved -= num_bytes;
+       }
+       spin_unlock(&cache->lock);
+       spin_unlock(&cache->space_info->lock);
 
-                       if (likely(unpin))
-                               clear_extent_dirty(&fs_info->pinned_extents,
-                                                  bytenr, bytenr + len -1,
-                                                  GFP_NOFS);
-                       else
-                               cache_block_group(cache);
+       btrfs_put_block_group(cache);
 
-                       if (unpin)
-                               btrfs_add_free_space(cache, bytenr, len);
-               }
-               btrfs_put_block_group(cache);
-               bytenr += len;
-               num -= len;
+       set_extent_dirty(fs_info->pinned_extents,
+                        bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+       return 0;
+}
+
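
btrfs_pin_extent() keeps the per-block-group and per-space-info counters in step under the lock order used throughout this file (space_info->lock outer, cache->lock inner). With reserved set, the bytes move straight from the reserved bucket to the pinned bucket; the working model, as far as the code shows, is that every byte lives in exactly one of free, reserved, pinned, or used. For example (numbers invented): pinning 1M of a reserved extent leaves bytes_used untouched and moves 1M from bytes_reserved to bytes_pinned, so the space_info totals still add up. Only after the locks are dropped is the extent recorded in the pinned_extents tree, whose pointer is flipped during commit by btrfs_prepare_extent_commit() below.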
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+                                  u64 num_bytes, int reserve)
+{
+       spin_lock(&cache->space_info->lock);
+       spin_lock(&cache->lock);
+       if (reserve) {
+               cache->reserved += num_bytes;
+               cache->space_info->bytes_reserved += num_bytes;
+       } else {
+               cache->reserved -= num_bytes;
+               cache->space_info->bytes_reserved -= num_bytes;
        }
+       spin_unlock(&cache->lock);
+       spin_unlock(&cache->space_info->lock);
        return 0;
 }
 
-static int update_reserved_extents(struct btrfs_root *root,
-                                  u64 bytenr, u64 num, int reserve)
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root)
 {
-       u64 len;
-       struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_caching_control *next;
+       struct btrfs_caching_control *caching_ctl;
+       struct btrfs_block_group_cache *cache;
 
-       while (num > 0) {
-               cache = btrfs_lookup_block_group(fs_info, bytenr);
-               BUG_ON(!cache);
-               len = min(num, cache->key.offset -
-                         (bytenr - cache->key.objectid));
+       down_write(&fs_info->extent_commit_sem);
 
-               spin_lock(&cache->space_info->lock);
-               spin_lock(&cache->lock);
-               if (reserve) {
-                       cache->reserved += len;
-                       cache->space_info->bytes_reserved += len;
+       list_for_each_entry_safe(caching_ctl, next,
+                                &fs_info->caching_block_groups, list) {
+               cache = caching_ctl->block_group;
+               if (block_group_cache_done(cache)) {
+                       cache->last_byte_to_unpin = (u64)-1;
+                       list_del_init(&caching_ctl->list);
+                       put_caching_control(caching_ctl);
                } else {
-                       cache->reserved -= len;
-                       cache->space_info->bytes_reserved -= len;
+                       cache->last_byte_to_unpin = caching_ctl->progress;
                }
-               spin_unlock(&cache->lock);
-               spin_unlock(&cache->space_info->lock);
-               btrfs_put_block_group(cache);
-               bytenr += len;
-               num -= len;
        }
+
+       if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+               fs_info->pinned_extents = &fs_info->freed_extents[1];
+       else
+               fs_info->pinned_extents = &fs_info->freed_extents[0];
+
+       up_write(&fs_info->extent_commit_sem);
        return 0;
 }
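
btrfs_prepare_extent_commit() is where the freed_extents double buffering pays off: extents pinned by the committing transaction sit in one tree, the pinned_extents pointer is flipped under extent_commit_sem, and btrfs_finish_extent_commit() can then drain the old tree while new pins land in the other. It also snapshots each in-flight cacher's position into last_byte_to_unpin, which unpin_extent_range() below consults to decide which bytes may go back into the free space cache. The flip itself is just the two-way toggle between &fs_info->freed_extents[0] and [1] shown above.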
 
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 {
-       u64 last = 0;
-       u64 start;
-       u64 end;
-       struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
-       int ret;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_block_group_cache *cache = NULL;
+       u64 len;
 
-       while (1) {
-               ret = find_first_extent_bit(pinned_extents, last,
-                                           &start, &end, EXTENT_DIRTY);
-               if (ret)
-                       break;
+       while (start <= end) {
+               if (!cache ||
+                   start >= cache->key.objectid + cache->key.offset) {
+                       if (cache)
+                               btrfs_put_block_group(cache);
+                       cache = btrfs_lookup_block_group(fs_info, start);
+                       BUG_ON(!cache);
+               }
+
+               len = cache->key.objectid + cache->key.offset - start;
+               len = min(len, end + 1 - start);
+
+               if (start < cache->last_byte_to_unpin) {
+                       len = min(len, cache->last_byte_to_unpin - start);
+                       btrfs_add_free_space(cache, start, len);
+               }
+
+               spin_lock(&cache->space_info->lock);
+               spin_lock(&cache->lock);
+               cache->pinned -= len;
+               cache->space_info->bytes_pinned -= len;
+               spin_unlock(&cache->lock);
+               spin_unlock(&cache->space_info->lock);
 
-               set_extent_dirty(copy, start, end, GFP_NOFS);
-               last = end + 1;
+               start += len;
        }
+
+       if (cache)
+               btrfs_put_block_group(cache);
        return 0;
 }
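
A worked pass through unpin_extent_range() (numbers invented): take a block group covering [0, 1G) with last_byte_to_unpin = 512M, and an unpin range of [508M, 516M). The first iteration clamps len to 4M (508M to 512M), returns those bytes to the free space cache, and drops them from the pinned counters; the loop re-enters at start = 512M, and the remaining 4M only leave the pinned counters. The idea is that bytes past the cacher's snapshot position stay out of the free space cache, since the still-running cacher will add them itself once it scans past that offset.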
 
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root,
-                              struct extent_io_tree *unpin)
+                              struct btrfs_root *root)
 {
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct extent_io_tree *unpin;
        u64 start;
        u64 end;
        int ret;
 
+       if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+               unpin = &fs_info->freed_extents[1];
+       else
+               unpin = &fs_info->freed_extents[0];
+
        while (1) {
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY);
@@ -3186,10 +3285,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
                ret = btrfs_discard_extent(root, start, end + 1 - start);
 
-               /* unlocks the pinned mutex */
-               btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
                clear_extent_dirty(unpin, start, end, GFP_NOFS);
-
+               unpin_extent_range(root, start, end);
                cond_resched();
        }
 
@@ -3199,7 +3296,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          struct btrfs_path *path,
-                         u64 bytenr, u64 num_bytes, int is_data,
+                         u64 bytenr, u64 num_bytes,
+                         int is_data, int reserved,
                          struct extent_buffer **must_clean)
 {
        int err = 0;
@@ -3231,15 +3329,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
        }
        free_extent_buffer(buf);
 pinit:
-       btrfs_set_path_blocking(path);
+       if (path)
+               btrfs_set_path_blocking(path);
        /* unlocks the pinned mutex */
-       btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+       btrfs_pin_extent(root, bytenr, num_bytes, reserved);
 
        BUG_ON(err < 0);
        return 0;
 }
 
-
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
@@ -3413,7 +3511,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                }
 
                ret = pin_down_bytes(trans, root, path, bytenr,
-                                    num_bytes, is_data, &must_clean);
+                                    num_bytes, is_data, 0, &must_clean);
                if (ret > 0)
                        mark_free = 1;
                BUG_ON(ret < 0);
@@ -3544,8 +3642,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
        if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
                WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
                /* unlocks the pinned mutex */
-               btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
-               update_reserved_extents(root, bytenr, num_bytes, 0);
+               btrfs_pin_extent(root, bytenr, num_bytes, 1);
                ret = 0;
        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
@@ -3585,19 +3682,33 @@ static noinline int
 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
                                u64 num_bytes)
 {
+       struct btrfs_caching_control *caching_ctl;
        DEFINE_WAIT(wait);
 
-       prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
-
-       if (block_group_cache_done(cache)) {
-               finish_wait(&cache->caching_q, &wait);
+       caching_ctl = get_caching_control(cache);
+       if (!caching_ctl)
                return 0;
-       }
-       schedule();
-       finish_wait(&cache->caching_q, &wait);
 
-       wait_event(cache->caching_q, block_group_cache_done(cache) ||
+       wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
                   (cache->free_space >= num_bytes));
+
+       put_caching_control(caching_ctl);
+       return 0;
+}
+
+static noinline int
+wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+       struct btrfs_caching_control *caching_ctl;
+       DEFINE_WAIT(wait);
+
+       caching_ctl = get_caching_control(cache);
+       if (!caching_ctl)
+               return 0;
+
+       wait_event(caching_ctl->wait, block_group_cache_done(cache));
+
+       put_caching_control(caching_ctl);
        return 0;
 }
 
@@ -3635,6 +3746,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        int last_ptr_loop = 0;
        int loop = 0;
        bool found_uncached_bg = false;
+       bool failed_cluster_refill = false;
 
        WARN_ON(num_bytes < root->sectorsize);
        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3732,7 +3844,16 @@ have_block_group:
                if (unlikely(block_group->ro))
                        goto loop;
 
-               if (last_ptr) {
+               /*
+                * Try the cluster allocator unless we are on
+                * LOOP_NO_EMPTY_SIZE.  By that point we have already
+                * retried the cluster plenty of times without finding
+                * anything, so the free space is probably too fragmented
+                * for clustering to help; skip it and let the allocator
+                * take whatever block it can find.
+                */
+               if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
                        /*
                         * the refill lock keeps out other
                         * people trying to start a new cluster
@@ -3807,9 +3928,11 @@ refill_cluster:
                                        spin_unlock(&last_ptr->refill_lock);
                                        goto checks;
                                }
-                       } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
+                       } else if (!cached && loop > LOOP_CACHING_NOWAIT
+                                  && !failed_cluster_refill) {
                                spin_unlock(&last_ptr->refill_lock);
 
+                               failed_cluster_refill = true;
                                wait_block_group_cache_progress(block_group,
                                       num_bytes + empty_cluster + empty_size);
                                goto have_block_group;
@@ -3821,13 +3944,9 @@ refill_cluster:
                         * cluster.  Free the cluster we've been trying
                         * to use, and go to the next block group
                         */
-                       if (loop < LOOP_NO_EMPTY_SIZE) {
-                               btrfs_return_cluster_to_free_space(NULL,
-                                                                  last_ptr);
-                               spin_unlock(&last_ptr->refill_lock);
-                               goto loop;
-                       }
+                       btrfs_return_cluster_to_free_space(NULL, last_ptr);
                        spin_unlock(&last_ptr->refill_lock);
+                       goto loop;
                }
 
                offset = btrfs_find_space_for_alloc(block_group, search_start,
@@ -3881,9 +4000,12 @@ checks:
                                             search_start - offset);
                BUG_ON(offset > search_start);
 
+               update_reserved_extents(block_group, num_bytes, 1);
+
                /* we are all good, lets return */
                break;
 loop:
+               failed_cluster_refill = false;
                btrfs_put_block_group(block_group);
        }
        up_read(&space_info->groups_sem);
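
The new failed_cluster_refill flag bounds the waiting in the cluster path: for each block group the allocator waits for caching progress at most once, and on a second refill failure it returns the cluster to the free space cache and moves on (the flag is reset at the loop: label, so the next block group gets its own single retry). The shape of the guard, condensed from the hunk above:

	} else if (!cached && loop > LOOP_CACHING_NOWAIT &&
		   !failed_cluster_refill) {
		failed_cluster_refill = true;
		wait_block_group_cache_progress(block_group,
			num_bytes + empty_cluster + empty_size);
		goto have_block_group;	/* the retry will not wait again */
	}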
@@ -3973,12 +4095,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
        up_read(&info->groups_sem);
 }
 
-static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
-                                 struct btrfs_root *root,
-                                 u64 num_bytes, u64 min_alloc_size,
-                                 u64 empty_size, u64 hint_byte,
-                                 u64 search_end, struct btrfs_key *ins,
-                                 u64 data)
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root,
+                        u64 num_bytes, u64 min_alloc_size,
+                        u64 empty_size, u64 hint_byte,
+                        u64 search_end, struct btrfs_key *ins,
+                        u64 data)
 {
        int ret;
        u64 search_start = 0;
@@ -4044,25 +4166,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
        ret = btrfs_discard_extent(root, start, len);
 
        btrfs_add_free_space(cache, start, len);
+       update_reserved_extents(cache, len, 0);
        btrfs_put_block_group(cache);
-       update_reserved_extents(root, start, len, 0);
-
-       return ret;
-}
-
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
-                                 struct btrfs_root *root,
-                                 u64 num_bytes, u64 min_alloc_size,
-                                 u64 empty_size, u64 hint_byte,
-                                 u64 search_end, struct btrfs_key *ins,
-                                 u64 data)
-{
-       int ret;
-       ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
-                                    empty_size, hint_byte, search_end, ins,
-                                    data);
-       if (!ret)
-               update_reserved_extents(root, ins->objectid, ins->offset, 1);
 
        return ret;
 }
@@ -4223,15 +4328,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 {
        int ret;
        struct btrfs_block_group_cache *block_group;
+       struct btrfs_caching_control *caching_ctl;
+       u64 start = ins->objectid;
+       u64 num_bytes = ins->offset;
 
        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
        cache_block_group(block_group);
-       wait_event(block_group->caching_q,
-                  block_group_cache_done(block_group));
+       caching_ctl = get_caching_control(block_group);
 
-       ret = btrfs_remove_free_space(block_group, ins->objectid,
-                                     ins->offset);
-       BUG_ON(ret);
+       if (!caching_ctl) {
+               BUG_ON(!block_group_cache_done(block_group));
+               ret = btrfs_remove_free_space(block_group, start, num_bytes);
+               BUG_ON(ret);
+       } else {
+               mutex_lock(&caching_ctl->mutex);
+
+               if (start >= caching_ctl->progress) {
+                       ret = add_excluded_extent(root, start, num_bytes);
+                       BUG_ON(ret);
+               } else if (start + num_bytes <= caching_ctl->progress) {
+                       ret = btrfs_remove_free_space(block_group,
+                                                     start, num_bytes);
+                       BUG_ON(ret);
+               } else {
+                       num_bytes = caching_ctl->progress - start;
+                       ret = btrfs_remove_free_space(block_group,
+                                                     start, num_bytes);
+                       BUG_ON(ret);
+
+                       start = caching_ctl->progress;
+                       num_bytes = ins->objectid + ins->offset -
+                                   caching_ctl->progress;
+                       ret = add_excluded_extent(root, start, num_bytes);
+                       BUG_ON(ret);
+               }
+
+               mutex_unlock(&caching_ctl->mutex);
+               put_caching_control(caching_ctl);
+       }
+
+       update_reserved_extents(block_group, ins->offset, 1);
        btrfs_put_block_group(block_group);
        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
                                         0, owner, offset, ins, 1);
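
btrfs_alloc_logged_file_extent() now has to cooperate with a cacher that may still be scanning the block group; caching_ctl->mutex makes the progress comparison atomic with respect to it. The three branches split the logged extent at caching_ctl->progress. For instance (numbers invented): with ins covering [100M, 110M) and progress = 104M, the already-cached 4M below progress are removed from the free space cache, while the 6M at and above progress are added as excluded extents so the cacher skips them when it arrives.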
@@ -4255,9 +4391,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
        int ret;
        u64 flags = 0;
 
-       ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
-                                    empty_size, hint_byte, search_end,
-                                    ins, 0);
+       ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
+                                  empty_size, hint_byte, search_end,
+                                  ins, 0);
        if (ret)
                return ret;
 
@@ -4268,7 +4404,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
        } else
                BUG_ON(parent > 0);
 
-       update_reserved_extents(root, ins->objectid, ins->offset, 1);
        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
                struct btrfs_delayed_extent_op *extent_op;
                extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
@@ -4347,452 +4482,99 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
        return buf;
 }
 
-#if 0
-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
-                       struct btrfs_root *root, struct extent_buffer *leaf)
+struct walk_control {
+       u64 refs[BTRFS_MAX_LEVEL];
+       u64 flags[BTRFS_MAX_LEVEL];
+       struct btrfs_key update_progress;
+       int stage;
+       int level;
+       int shared_level;
+       int update_ref;
+       int keep_locks;
+       int reada_slot;
+       int reada_count;
+};
+
+#define DROP_REFERENCE 1
+#define UPDATE_BACKREF 2
+
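
reada_walk_down() below sizes its readahead window multiplicatively: when the walker has consumed the previous window (slot >= reada_slot) the window grows by 3/2, otherwise it shrinks by 2/3, clamped between 2 and the number of pointers per node. The policy in isolation (a sketch; adjust_reada_window() is a hypothetical name, the bounds are mirrored from the code):

	static int adjust_reada_window(int window, bool consumed, int limit)
	{
		if (consumed)
			window = min(window * 3 / 2, limit);
		else
			window = max(window * 2 / 3, 2);
		return window;
	}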
+static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct walk_control *wc,
+                                    struct btrfs_path *path)
 {
-       u64 disk_bytenr;
-       u64 num_bytes;
-       struct btrfs_key key;
-       struct btrfs_file_extent_item *fi;
+       u64 bytenr;
+       u64 generation;
+       u64 refs;
+       u64 last = 0;
        u32 nritems;
-       int i;
+       u32 blocksize;
+       struct btrfs_key key;
+       struct extent_buffer *eb;
        int ret;
+       int slot;
+       int nread = 0;
 
-       BUG_ON(!btrfs_is_leaf(leaf));
-       nritems = btrfs_header_nritems(leaf);
+       if (path->slots[wc->level] < wc->reada_slot) {
+               wc->reada_count = wc->reada_count * 2 / 3;
+               wc->reada_count = max(wc->reada_count, 2);
+       } else {
+               wc->reada_count = wc->reada_count * 3 / 2;
+               wc->reada_count = min_t(int, wc->reada_count,
+                                       BTRFS_NODEPTRS_PER_BLOCK(root));
+       }
 
-       for (i = 0; i < nritems; i++) {
-               cond_resched();
-               btrfs_item_key_to_cpu(leaf, &key, i);
+       eb = path->nodes[wc->level];
+       nritems = btrfs_header_nritems(eb);
+       blocksize = btrfs_level_size(root, wc->level - 1);
 
-               /* only extents have references, skip everything else */
-               if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-                       continue;
-
-               fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
-
-               /* inline extents live in the btree, they don't have refs */
-               if (btrfs_file_extent_type(leaf, fi) ==
-                   BTRFS_FILE_EXTENT_INLINE)
-                       continue;
-
-               disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-
-               /* holes don't have refs */
-               if (disk_bytenr == 0)
-                       continue;
-
-               num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
-               ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
-                                       leaf->start, 0, key.objectid, 0);
-               BUG_ON(ret);
-       }
-       return 0;
-}
-
-static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
-                                       struct btrfs_root *root,
-                                       struct btrfs_leaf_ref *ref)
-{
-       int i;
-       int ret;
-       struct btrfs_extent_info *info;
-       struct refsort *sorted;
-
-       if (ref->nritems == 0)
-               return 0;
-
-       sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
-       for (i = 0; i < ref->nritems; i++) {
-               sorted[i].bytenr = ref->extents[i].bytenr;
-               sorted[i].slot = i;
-       }
-       sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+       for (slot = path->slots[wc->level]; slot < nritems; slot++) {
+               if (nread >= wc->reada_count)
+                       break;
 
-       /*
-        * the items in the ref were sorted when the ref was inserted
-        * into the ref cache, so this is already in order
-        */
-       for (i = 0; i < ref->nritems; i++) {
-               info = ref->extents + sorted[i].slot;
-               ret = btrfs_free_extent(trans, root, info->bytenr,
-                                         info->num_bytes, ref->bytenr,
-                                         ref->owner, ref->generation,
-                                         info->objectid, 0);
-
-               atomic_inc(&root->fs_info->throttle_gen);
-               wake_up(&root->fs_info->transaction_throttle);
                cond_resched();
+               bytenr = btrfs_node_blockptr(eb, slot);
+               generation = btrfs_node_ptr_generation(eb, slot);
 
-               BUG_ON(ret);
-               info++;
-       }
-
-       kfree(sorted);
-       return 0;
-}
-
-
-static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
-                                    struct btrfs_root *root, u64 start,
-                                    u64 len, u32 *refs)
-{
-       int ret;
-
-       ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
-       BUG_ON(ret);
-
-#if 0 /* some debugging code in case we see problems here */
-       /* if the refs count is one, it won't get increased again.  But
-        * if the ref count is > 1, someone may be decreasing it at
-        * the same time we are.
-        */
-       if (*refs != 1) {
-               struct extent_buffer *eb = NULL;
-               eb = btrfs_find_create_tree_block(root, start, len);
-               if (eb)
-                       btrfs_tree_lock(eb);
-
-               mutex_lock(&root->fs_info->alloc_mutex);
-               ret = lookup_extent_ref(NULL, root, start, len, refs);
-               BUG_ON(ret);
-               mutex_unlock(&root->fs_info->alloc_mutex);
-
-               if (eb) {
-                       btrfs_tree_unlock(eb);
-                       free_extent_buffer(eb);
-               }
-               if (*refs == 1) {
-                       printk(KERN_ERR "btrfs block %llu went down to one "
-                              "during drop_snap\n", (unsigned long long)start);
-               }
-
-       }
-#endif
-
-       cond_resched();
-       return ret;
-}
-
+               if (slot == path->slots[wc->level])
+                       goto reada;
 
-/*
- * this is used while deleting old snapshots, and it drops the refs
- * on a whole subtree starting from a level 1 node.
- *
- * The idea is to sort all the leaf pointers, and then drop the
- * ref on all the leaves in order.  Most of the time the leaves
- * will have ref cache entries, so no leaf IOs will be required to
- * find the extents they have references on.
- *
- * For each leaf, any references it has are also dropped in order
- *
- * This ends up dropping the references in something close to optimal
- * order for reading and modifying the extent allocation tree.
- */
-static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
-                                       struct btrfs_root *root,
-                                       struct btrfs_path *path)
-{
-       u64 bytenr;
-       u64 root_owner;
-       u64 root_gen;
-       struct extent_buffer *eb = path->nodes[1];
-       struct extent_buffer *leaf;
-       struct btrfs_leaf_ref *ref;
-       struct refsort *sorted = NULL;
-       int nritems = btrfs_header_nritems(eb);
-       int ret;
-       int i;
-       int refi = 0;
-       int slot = path->slots[1];
-       u32 blocksize = btrfs_level_size(root, 0);
-       u32 refs;
-
-       if (nritems == 0)
-               goto out;
-
-       root_owner = btrfs_header_owner(eb);
-       root_gen = btrfs_header_generation(eb);
-       sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
-
-       /*
-        * step one, sort all the leaf pointers so we don't scribble
-        * randomly into the extent allocation tree
-        */
-       for (i = slot; i < nritems; i++) {
-               sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
-               sorted[refi].slot = i;
-               refi++;
-       }
-
-       /*
-        * nritems won't be zero, but if we're picking up drop_snapshot
-        * after a crash, slot might be > 0, so double check things
-        * just in case.
-        */
-       if (refi == 0)
-               goto out;
-
-       sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
-
-       /*
-        * the first loop frees everything the leaves point to
-        */
-       for (i = 0; i < refi; i++) {
-               u64 ptr_gen;
-
-               bytenr = sorted[i].bytenr;
-
-               /*
-                * check the reference count on this leaf.  If it is > 1
-                * we just decrement it below and don't update any
-                * of the refs the leaf points to.
-                */
-               ret = drop_snap_lookup_refcount(trans, root, bytenr,
-                                               blocksize, &refs);
-               BUG_ON(ret);
-               if (refs != 1)
+               if (wc->stage == UPDATE_BACKREF &&
+                   generation <= root->root_key.offset)
                        continue;
 
-               ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
-
-               /*
-                * the leaf only had one reference, which means the
-                * only thing pointing to this leaf is the snapshot
-                * we're deleting.  It isn't possible for the reference
-                * count to increase again later
-                *
-                * The reference cache is checked for the leaf,
-                * and if found we'll be able to drop any refs held by
-                * the leaf without needing to read it in.
-                */
-               ref = btrfs_lookup_leaf_ref(root, bytenr);
-               if (ref && ref->generation != ptr_gen) {
-                       btrfs_free_leaf_ref(root, ref);
-                       ref = NULL;
-               }
-               if (ref) {
-                       ret = cache_drop_leaf_ref(trans, root, ref);
-                       BUG_ON(ret);
-                       btrfs_remove_leaf_ref(root, ref);
-                       btrfs_free_leaf_ref(root, ref);
-               } else {
-                       /*
-                        * the leaf wasn't in the reference cache, so
-                        * we have to read it.
-                        */
-                       leaf = read_tree_block(root, bytenr, blocksize,
-                                              ptr_gen);
-                       ret = btrfs_drop_leaf_ref(trans, root, leaf);
+               if (wc->stage == DROP_REFERENCE) {
+                       ret = btrfs_lookup_extent_info(trans, root,
+                                               bytenr, blocksize,
+                                               &refs, NULL);
                        BUG_ON(ret);
-                       free_extent_buffer(leaf);
-               }
-               atomic_inc(&root->fs_info->throttle_gen);
-               wake_up(&root->fs_info->transaction_throttle);
-               cond_resched();
-       }
-
-       /*
-        * run through the loop again to free the refs on the leaves.
-        * This is faster than doing it in the loop above because
-        * the leaves are likely to be clustered together.  We end up
-        * working in nice chunks on the extent allocation tree.
-        */
-       for (i = 0; i < refi; i++) {
-               bytenr = sorted[i].bytenr;
-               ret = btrfs_free_extent(trans, root, bytenr,
-                                       blocksize, eb->start,
-                                       root_owner, root_gen, 0, 1);
-               BUG_ON(ret);
+                       BUG_ON(refs == 0);
+                       if (refs == 1)
+                               goto reada;
 
-               atomic_inc(&root->fs_info->throttle_gen);
-               wake_up(&root->fs_info->transaction_throttle);
-               cond_resched();
-       }
-out:
-       kfree(sorted);
-
-       /*
-        * update the path to show we've processed the entire level 1
-        * node.  This will get saved into the root's drop_snapshot_progress
-        * field so these drops are not repeated again if this transaction
-        * commits.
-        */
-       path->slots[1] = nritems;
-       return 0;
-}
-
-/*
- * helper function for drop_snapshot, this walks down the tree dropping ref
- * counts as it goes.
- */
-static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
-                                  struct btrfs_root *root,
-                                  struct btrfs_path *path, int *level)
-{
-       u64 root_owner;
-       u64 root_gen;
-       u64 bytenr;
-       u64 ptr_gen;
-       struct extent_buffer *next;
-       struct extent_buffer *cur;
-       struct extent_buffer *parent;
-       u32 blocksize;
-       int ret;
-       u32 refs;
-
-       WARN_ON(*level < 0);
-       WARN_ON(*level >= BTRFS_MAX_LEVEL);
-       ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
-                               path->nodes[*level]->len, &refs);
-       BUG_ON(ret);
-       if (refs > 1)
-               goto out;
-
-       /*
-        * walk down to the last node level and free all the leaves
-        */
-       while (*level >= 0) {
-               WARN_ON(*level < 0);
-               WARN_ON(*level >= BTRFS_MAX_LEVEL);
-               cur = path->nodes[*level];
-
-               if (btrfs_header_level(cur) != *level)
-                       WARN_ON(1);
-
-               if (path->slots[*level] >=
-                   btrfs_header_nritems(cur))
-                       break;
-
-               /* the new code goes down to level 1 and does all the
-                * leaves pointed to that node in bulk.  So, this check
-                * for level 0 will always be false.
-                *
-                * But, the disk format allows the drop_snapshot_progress
-                * field in the root to leave things in a state where
-                * a leaf will need cleaning up here.  If someone crashes
-                * with the old code and then boots with the new code,
-                * we might find a leaf here.
-                */
-               if (*level == 0) {
-                       ret = btrfs_drop_leaf_ref(trans, root, cur);
-                       BUG_ON(ret);
-                       break;
+                       if (!wc->update_ref ||
+                           generation <= root->root_key.offset)
+                               continue;
+                       btrfs_node_key_to_cpu(eb, &key, slot);
+                       ret = btrfs_comp_cpu_keys(&key,
+                                                 &wc->update_progress);
+                       if (ret < 0)
+                               continue;
                }
-
-               /*
-                * once we get to level one, process the whole node
-                * at once, including everything below it.
-                */
-               if (*level == 1) {
-                       ret = drop_level_one_refs(trans, root, path);
-                       BUG_ON(ret);
+reada:
+               ret = readahead_tree_block(root, bytenr, blocksize,
+                                          generation);
+               if (ret)
                        break;
-               }
-
-               bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
-               ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
-               blocksize = btrfs_level_size(root, *level - 1);
-
-               ret = drop_snap_lookup_refcount(trans, root, bytenr,
-                                               blocksize, &refs);
-               BUG_ON(ret);
-
-               /*
-                * if there is more than one reference, we don't need
-                * to read that node to drop any references it has.  We
-                * just drop the ref we hold on that node and move on to the
-                * next slot in this level.
-                */
-               if (refs != 1) {
-                       parent = path->nodes[*level];
-                       root_owner = btrfs_header_owner(parent);
-                       root_gen = btrfs_header_generation(parent);
-                       path->slots[*level]++;
-
-                       ret = btrfs_free_extent(trans, root, bytenr,
-                                               blocksize, parent->start,
-                                               root_owner, root_gen,
-                                               *level - 1, 1);
-                       BUG_ON(ret);
-
-                       atomic_inc(&root->fs_info->throttle_gen);
-                       wake_up(&root->fs_info->transaction_throttle);
-                       cond_resched();
-
-                       continue;
-               }
-
-               /*
-                * we need to keep freeing things in the next level down.
-                * read the block and loop around to process it
-                */
-               next = read_tree_block(root, bytenr, blocksize, ptr_gen);
-               WARN_ON(*level <= 0);
-               if (path->nodes[*level-1])
-                       free_extent_buffer(path->nodes[*level-1]);
-               path->nodes[*level-1] = next;
-               *level = btrfs_header_level(next);
-               path->slots[*level] = 0;
-               cond_resched();
+               last = bytenr + blocksize;
+               nread++;
        }
-out:
-       WARN_ON(*level < 0);
-       WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
-       if (path->nodes[*level] == root->node) {
-               parent = path->nodes[*level];
-               bytenr = path->nodes[*level]->start;
-       } else {
-               parent = path->nodes[*level + 1];
-               bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
-       }
-
-       blocksize = btrfs_level_size(root, *level);
-       root_owner = btrfs_header_owner(parent);
-       root_gen = btrfs_header_generation(parent);
-
-       /*
-        * cleanup and free the reference on the last node
-        * we processed
-        */
-       ret = btrfs_free_extent(trans, root, bytenr, blocksize,
-                                 parent->start, root_owner, root_gen,
-                                 *level, 1);
-       free_extent_buffer(path->nodes[*level]);
-       path->nodes[*level] = NULL;
-
-       *level += 1;
-       BUG_ON(ret);
-
-       cond_resched();
-       return 0;
+       wc->reada_slot = slot;
 }
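
The loop above only primes the page cache; nothing is read synchronously. For one slot, the prefetch it issues boils down to the following sketch (simplified, skip checks omitted; the helpers are the standard btree accessors):

        /* prefetch the child of 'node' at 'slot' without blocking */
        u64 bytenr = btrfs_node_blockptr(node, slot);
        u64 gen = btrfs_node_ptr_generation(node, slot);
        u32 blocksize = btrfs_level_size(root, btrfs_header_level(node) - 1);

        readahead_tree_block(root, bytenr, blocksize, gen);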
-#endif
-
-struct walk_control {
-       u64 refs[BTRFS_MAX_LEVEL];
-       u64 flags[BTRFS_MAX_LEVEL];
-       struct btrfs_key update_progress;
-       int stage;
-       int level;
-       int shared_level;
-       int update_ref;
-       int keep_locks;
-};
-
-#define DROP_REFERENCE 1
-#define UPDATE_BACKREF 2
 
 /*
  * helper to process tree block while walking down the tree.
  *
- * when wc->stage == DROP_REFERENCE, this function checks
- * reference count of the block. if the block is shared and
- * we need update back refs for the subtree rooted at the
- * block, this function changes wc->stage to UPDATE_BACKREF
- *
  * when wc->stage == UPDATE_BACKREF, this function updates
  * back refs for pointers in the block.
  *
@@ -4805,7 +4587,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 {
        int level = wc->level;
        struct extent_buffer *eb = path->nodes[level];
-       struct btrfs_key key;
        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
        int ret;
 
@@ -4828,21 +4609,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
                BUG_ON(wc->refs[level] == 0);
        }
 
-       if (wc->stage == DROP_REFERENCE &&
-           wc->update_ref && wc->refs[level] > 1) {
-               BUG_ON(eb == root->node);
-               BUG_ON(path->slots[level] > 0);
-               if (level == 0)
-                       btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
-               else
-                       btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
-               if (btrfs_header_owner(eb) == root->root_key.objectid &&
-                   btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
-                       wc->stage = UPDATE_BACKREF;
-                       wc->shared_level = level;
-               }
-       }
-
        if (wc->stage == DROP_REFERENCE) {
                if (wc->refs[level] > 1)
                        return 1;
@@ -4878,6 +4644,123 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
        return 0;
 }
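
During UPDATE_BACKREF the walk converts each shared block it owns to the full-backref scheme. A reduced sketch of that idiom, assuming the inc/dec ref helpers this file already uses (last argument: 1 = full backref, 0 = owner-style ref):

        ret = btrfs_inc_ref(trans, root, eb, 1);
        BUG_ON(ret);
        ret = btrfs_dec_ref(trans, root, eb, 0);
        BUG_ON(ret);
        ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len,
                                          BTRFS_BLOCK_FLAG_FULL_BACKREF, 0);
        BUG_ON(ret);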
 
+/*
+ * helper to process tree block pointer.
+ *
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * the reference count of the block pointed to. if the block
+ * is shared and we need to update back refs for the subtree
+ * rooted at the block, this function changes wc->stage to
+ * UPDATE_BACKREF. if the block is shared and there is no
+ * need to update back refs, this function drops the
+ * reference to the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int do_walk_down(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct btrfs_path *path,
+                                struct walk_control *wc)
+{
+       u64 bytenr;
+       u64 generation;
+       u64 parent;
+       u32 blocksize;
+       struct btrfs_key key;
+       struct extent_buffer *next;
+       int level = wc->level;
+       int reada = 0;
+       int ret = 0;
+
+       generation = btrfs_node_ptr_generation(path->nodes[level],
+                                              path->slots[level]);
+       /*
+        * if the lower level block was created before the snapshot
+        * was created, we know there is no need to update back refs
+        * for the subtree
+        */
+       if (wc->stage == UPDATE_BACKREF &&
+           generation <= root->root_key.offset)
+               return 1;
+
+       bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+       blocksize = btrfs_level_size(root, level - 1);
+
+       next = btrfs_find_tree_block(root, bytenr, blocksize);
+       if (!next) {
+               next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+               reada = 1;
+       }
+       btrfs_tree_lock(next);
+       btrfs_set_lock_blocking(next);
+
+       if (wc->stage == DROP_REFERENCE) {
+               ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+                                              &wc->refs[level - 1],
+                                              &wc->flags[level - 1]);
+               BUG_ON(ret);
+               BUG_ON(wc->refs[level - 1] == 0);
+
+               if (wc->refs[level - 1] > 1) {
+                       if (!wc->update_ref ||
+                           generation <= root->root_key.offset)
+                               goto skip;
+
+                       btrfs_node_key_to_cpu(path->nodes[level], &key,
+                                             path->slots[level]);
+                       ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
+                       if (ret < 0)
+                               goto skip;
+
+                       wc->stage = UPDATE_BACKREF;
+                       wc->shared_level = level - 1;
+               }
+       }
+
+       if (!btrfs_buffer_uptodate(next, generation)) {
+               btrfs_tree_unlock(next);
+               free_extent_buffer(next);
+               next = NULL;
+       }
+
+       if (!next) {
+               if (reada && level == 1)
+                       reada_walk_down(trans, root, wc, path);
+               next = read_tree_block(root, bytenr, blocksize, generation);
+               btrfs_tree_lock(next);
+               btrfs_set_lock_blocking(next);
+       }
+
+       level--;
+       BUG_ON(level != btrfs_header_level(next));
+       path->nodes[level] = next;
+       path->slots[level] = 0;
+       path->locks[level] = 1;
+       wc->level = level;
+       if (wc->level == 1)
+               wc->reada_slot = 0;
+       return 0;
+skip:
+       wc->refs[level - 1] = 0;
+       wc->flags[level - 1] = 0;
+
+       if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+               parent = path->nodes[level]->start;
+       } else {
+               BUG_ON(root->root_key.objectid !=
+                      btrfs_header_owner(path->nodes[level]));
+               parent = 0;
+       }
+
+       ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+                               root->root_key.objectid, level - 1, 0);
+       BUG_ON(ret);
+
+       btrfs_tree_unlock(next);
+       free_extent_buffer(next);
+       return 1;
+}
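
Condensed, the DROP_REFERENCE decision above is a three-way classification of the child block. A hypothetical helper (not in the patch) makes the branches explicit; here snap_gen stands for root->root_key.offset, the generation the snapshot was taken at, and key_cmp for btrfs_comp_cpu_keys(&key, &wc->update_progress):

        enum child_action {
                WALK_INTO,              /* refs == 1: block is exclusively ours */
                FLIP_TO_UPDATE_BACKREF, /* shared; walk in, rewrite back refs */
                DROP_AND_SKIP,          /* shared; drop our ref and move on */
        };

        static enum child_action classify_child(u64 refs, u64 generation,
                                                u64 snap_gen, int update_ref,
                                                int key_cmp)
        {
                if (refs == 1)
                        return WALK_INTO;
                if (update_ref && generation > snap_gen && key_cmp >= 0)
                        return FLIP_TO_UPDATE_BACKREF;
                return DROP_AND_SKIP;
        }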
+
 /*
  * helper to process tree block while walking up the tree.
  *
@@ -4905,7 +4788,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                if (level < wc->shared_level)
                        goto out;
 
-               BUG_ON(wc->refs[level] <= 1);
                ret = find_next_key(path, level + 1, &wc->update_progress);
                if (ret > 0)
                        wc->update_ref = 0;
@@ -4936,8 +4818,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                                path->locks[level] = 0;
                                return 1;
                        }
-               } else {
-                       BUG_ON(level != 0);
                }
        }
 
@@ -4990,17 +4870,13 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                                   struct btrfs_path *path,
                                   struct walk_control *wc)
 {
-       struct extent_buffer *next;
-       struct extent_buffer *cur;
-       u64 bytenr;
-       u64 ptr_gen;
-       u32 blocksize;
        int level = wc->level;
        int ret;
 
        while (level >= 0) {
-               cur = path->nodes[level];
-               BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
+               if (path->slots[level] >=
+                   btrfs_header_nritems(path->nodes[level]))
+                       break;
 
                ret = walk_down_proc(trans, root, path, wc);
                if (ret > 0)
@@ -5009,20 +4885,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                if (level == 0)
                        break;
 
-               bytenr = btrfs_node_blockptr(cur, path->slots[level]);
-               blocksize = btrfs_level_size(root, level - 1);
-               ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
-
-               next = read_tree_block(root, bytenr, blocksize, ptr_gen);
-               btrfs_tree_lock(next);
-               btrfs_set_lock_blocking(next);
-
-               level--;
-               BUG_ON(level != btrfs_header_level(next));
-               path->nodes[level] = next;
-               path->slots[level] = 0;
-               path->locks[level] = 1;
-               wc->level = level;
+               ret = do_walk_down(trans, root, path, wc);
+               if (ret > 0) {
+                       path->slots[level]++;
+                       continue;
+               }
+               level = wc->level;
        }
        return 0;
 }
@@ -5112,9 +4980,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
                        err = ret;
                        goto out;
                }
-               btrfs_node_key_to_cpu(path->nodes[level], &key,
-                                     path->slots[level]);
-               WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
+               WARN_ON(ret > 0);
 
                /*
                 * unlock our path, this is safe because only this
@@ -5149,6 +5015,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
        wc->stage = DROP_REFERENCE;
        wc->update_ref = update_ref;
        wc->keep_locks = 0;
+       wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
        while (1) {
                ret = walk_down_tree(trans, root, path, wc);
@@ -5201,9 +5068,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
        ret = btrfs_del_root(trans, tree_root, &root->root_key);
        BUG_ON(ret);
 
-       free_extent_buffer(root->node);
-       free_extent_buffer(root->commit_root);
-       kfree(root);
+       if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+               ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
+                                          NULL, NULL);
+               BUG_ON(ret < 0);
+               if (ret > 0) {
+                       ret = btrfs_del_orphan_item(trans, tree_root,
+                                                   root->root_key.objectid);
+                       BUG_ON(ret);
+               }
+       }
+
+       if (root->in_radix) {
+               btrfs_free_fs_root(tree_root->fs_info, root);
+       } else {
+               free_extent_buffer(root->node);
+               free_extent_buffer(root->commit_root);
+               kfree(root);
+       }
 out:
        btrfs_end_transaction(trans, tree_root);
        kfree(wc);
@@ -5255,6 +5137,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
        wc->stage = DROP_REFERENCE;
        wc->update_ref = 0;
        wc->keep_locks = 1;
+       wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
        while (1) {
                wret = walk_down_tree(trans, root, path, wc);
@@ -5397,9 +5280,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
        lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
        while (1) {
                int ret;
-               spin_lock(&em_tree->lock);
+               write_lock(&em_tree->lock);
                ret = add_extent_mapping(em_tree, em);
-               spin_unlock(&em_tree->lock);
+               write_unlock(&em_tree->lock);
                if (ret != -EEXIST) {
                        free_extent_map(em);
                        break;
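
This series also converts the extent_map tree's spinlock to a rwlock, so mapping lookups can run in parallel while writers, as above, take the lock exclusively. A sketch of the matching read side (assuming lookup_extent_mapping keeps its usual signature):

        struct extent_map *em;

        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        read_unlock(&em_tree->lock);
        if (em) {
                /* ... use em->block_start, em->len ... */
                free_extent_map(em);
        }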
@@ -6842,287 +6725,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
        return 0;
 }
 
-#if 0
-static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root,
-                                u64 objectid, u64 size)
-{
-       struct btrfs_path *path;
-       struct btrfs_inode_item *item;
-       struct extent_buffer *leaf;
-       int ret;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       path->leave_spinning = 1;
-       ret = btrfs_insert_empty_inode(trans, root, path, objectid);
-       if (ret)
-               goto out;
-
-       leaf = path->nodes[0];
-       item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
-       memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
-       btrfs_set_inode_generation(leaf, item, 1);
-       btrfs_set_inode_size(leaf, item, size);
-       btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
-       btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
-       btrfs_mark_buffer_dirty(leaf);
-       btrfs_release_path(root, path);
-out:
-       btrfs_free_path(path);
-       return ret;
-}
-
-static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
-                                       struct btrfs_block_group_cache *group)
+/*
+ * checks to see if it's even possible to relocate this block group.
+ *
+ * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
+ * ok to go ahead and try.
+ */
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 {
-       struct inode *inode = NULL;
-       struct btrfs_trans_handle *trans;
-       struct btrfs_root *root;
-       struct btrfs_key root_key;
-       u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
-       int err = 0;
+       struct btrfs_block_group_cache *block_group;
+       struct btrfs_space_info *space_info;
+       struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+       struct btrfs_device *device;
+       int full = 0;
+       int ret = 0;
 
-       root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
-       root_key.type = BTRFS_ROOT_ITEM_KEY;
-       root_key.offset = (u64)-1;
-       root = btrfs_read_fs_root_no_name(fs_info, &root_key);
-       if (IS_ERR(root))
-               return ERR_CAST(root);
+       block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
 
-       trans = btrfs_start_transaction(root, 1);
-       BUG_ON(!trans);
+       /* odd, couldn't find the block group, leave it alone */
+       if (!block_group)
+               return -1;
 
-       err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
-       if (err)
+       /* no bytes used, we're good */
+       if (!btrfs_block_group_used(&block_group->item))
                goto out;
 
-       err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
-       BUG_ON(err);
-
-       err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-                                      group->key.offset, 0, group->key.offset,
-                                      0, 0, 0);
-       BUG_ON(err);
-
-       inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
-       if (inode->i_state & I_NEW) {
-               BTRFS_I(inode)->root = root;
-               BTRFS_I(inode)->location.objectid = objectid;
-               BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
-               BTRFS_I(inode)->location.offset = 0;
-               btrfs_read_locked_inode(inode);
-               unlock_new_inode(inode);
-               BUG_ON(is_bad_inode(inode));
-       } else {
-               BUG_ON(1);
-       }
-       BTRFS_I(inode)->index_cnt = group->key.objectid;
-
-       err = btrfs_orphan_add(trans, inode);
-out:
-       btrfs_end_transaction(trans, root);
-       if (err) {
-               if (inode)
-                       iput(inode);
-               inode = ERR_PTR(err);
-       }
-       return inode;
-}
-
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
-{
-
-       struct btrfs_ordered_sum *sums;
-       struct btrfs_sector_sum *sector_sum;
-       struct btrfs_ordered_extent *ordered;
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct list_head list;
-       size_t offset;
-       int ret;
-       u64 disk_bytenr;
-
-       INIT_LIST_HEAD(&list);
-
-       ordered = btrfs_lookup_ordered_extent(inode, file_pos);
-       BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
-
-       disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
-       ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
-                                      disk_bytenr + len - 1, &list);
-
-       while (!list_empty(&list)) {
-               sums = list_entry(list.next, struct btrfs_ordered_sum, list);
-               list_del_init(&sums->list);
-
-               sector_sum = sums->sums;
-               sums->bytenr = ordered->start;
+       space_info = block_group->space_info;
+       spin_lock(&space_info->lock);
 
-               offset = 0;
-               while (offset < sums->len) {
-                       sector_sum->bytenr += ordered->start - disk_bytenr;
-                       sector_sum++;
-                       offset += root->sectorsize;
-               }
+       full = space_info->full;
 
-               btrfs_add_ordered_sum(inode, ordered, sums);
+       /*
+        * if this is the last block group we have in this space, we can't
+        * relocate it unless we're able to allocate a new chunk below.
+        *
+        * Otherwise, we need to make sure we have room in the space to handle
+        * all of the extents from this block group.  If we can, we're good
+        */
+       if ((space_info->total_bytes != block_group->key.offset) &&
+          (space_info->bytes_used + space_info->bytes_reserved +
+           space_info->bytes_pinned + space_info->bytes_readonly +
+           btrfs_block_group_used(&block_group->item) <
+           space_info->total_bytes)) {
+               spin_unlock(&space_info->lock);
+               goto out;
        }
-       btrfs_put_ordered_extent(ordered);
-       return 0;
-}
-
-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
-{
-       struct btrfs_trans_handle *trans;
-       struct btrfs_path *path;
-       struct btrfs_fs_info *info = root->fs_info;
-       struct extent_buffer *leaf;
-       struct inode *reloc_inode;
-       struct btrfs_block_group_cache *block_group;
-       struct btrfs_key key;
-       u64 skipped;
-       u64 cur_byte;
-       u64 total_found;
-       u32 nritems;
-       int ret;
-       int progress;
-       int pass = 0;
-
-       root = root->fs_info->extent_root;
-
-       block_group = btrfs_lookup_block_group(info, group_start);
-       BUG_ON(!block_group);
-
-       printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
-              (unsigned long long)block_group->key.objectid,
-              (unsigned long long)block_group->flags);
-
-       path = btrfs_alloc_path();
-       BUG_ON(!path);
-
-       reloc_inode = create_reloc_inode(info, block_group);
-       BUG_ON(IS_ERR(reloc_inode));
-
-       __alloc_chunk_for_shrink(root, block_group, 1);
-       set_block_group_readonly(block_group);
-
-       btrfs_start_delalloc_inodes(info->tree_root);
-       btrfs_wait_ordered_extents(info->tree_root, 0);
-again:
-       skipped = 0;
-       total_found = 0;
-       progress = 0;
-       key.objectid = block_group->key.objectid;
-       key.offset = 0;
-       key.type = 0;
-       cur_byte = key.objectid;
-
-       trans = btrfs_start_transaction(info->tree_root, 1);
-       btrfs_commit_transaction(trans, info->tree_root);
+       spin_unlock(&space_info->lock);
 
-       mutex_lock(&root->fs_info->cleaner_mutex);
-       btrfs_clean_old_snapshots(info->tree_root);
-       btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
-       mutex_unlock(&root->fs_info->cleaner_mutex);
+       /*
+        * ok we don't have enough space, but maybe we have free space on our
+        * devices to allocate new chunks for relocation, so loop through our
+        * alloc devices and guess if we have enough space.  However, if we
+        * were marked as full, then we know there aren't enough chunks, and we
+        * can just return.
+        */
+       ret = -1;
+       if (full)
+               goto out;
 
-       trans = btrfs_start_transaction(info->tree_root, 1);
-       btrfs_commit_transaction(trans, info->tree_root);
+       mutex_lock(&root->fs_info->chunk_mutex);
+       list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+               u64 min_free = btrfs_block_group_used(&block_group->item);
+               u64 dev_offset, max_avail;
 
-       while (1) {
-               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-               if (ret < 0)
-                       goto out;
-next:
-               leaf = path->nodes[0];
-               nritems = btrfs_header_nritems(leaf);
-               if (path->slots[0] >= nritems) {
-                       ret = btrfs_next_leaf(root, path);
-                       if (ret < 0)
-                               goto out;
-                       if (ret == 1) {
-                               ret = 0;
+               /*
+                * check to make sure we can actually find a chunk with enough
+                * space to fit our block group in.
+                */
+               if (device->total_bytes > device->bytes_used + min_free) {
+                       ret = find_free_dev_extent(NULL, device, min_free,
+                                                  &dev_offset, &max_avail);
+                       if (!ret)
                                break;
-                       }
-                       leaf = path->nodes[0];
-                       nritems = btrfs_header_nritems(leaf);
-               }
-
-               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-
-               if (key.objectid >= block_group->key.objectid +
-                   block_group->key.offset)
-                       break;
-
-               if (progress && need_resched()) {
-                       btrfs_release_path(root, path);
-                       cond_resched();
-                       progress = 0;
-                       continue;
+                       ret = -1;
                }
-               progress = 1;
-
-               if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
-                   key.objectid + key.offset <= cur_byte) {
-                       path->slots[0]++;
-                       goto next;
-               }
-
-               total_found++;
-               cur_byte = key.objectid + key.offset;
-               btrfs_release_path(root, path);
-
-               __alloc_chunk_for_shrink(root, block_group, 0);
-               ret = relocate_one_extent(root, path, &key, block_group,
-                                         reloc_inode, pass);
-               BUG_ON(ret < 0);
-               if (ret > 0)
-                       skipped++;
-
-               key.objectid = cur_byte;
-               key.type = 0;
-               key.offset = 0;
        }
-
-       btrfs_release_path(root, path);
-
-       if (pass == 0) {
-               btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
-               invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
-       }
-
-       if (total_found > 0) {
-               printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
-                      (unsigned long long)total_found, pass);
-               pass++;
-               if (total_found == skipped && pass > 2) {
-                       iput(reloc_inode);
-                       reloc_inode = create_reloc_inode(info, block_group);
-                       pass = 0;
-               }
-               goto again;
-       }
-
-       /* delete reloc_inode */
-       iput(reloc_inode);
-
-       /* unpin extents in this range */
-       trans = btrfs_start_transaction(info->tree_root, 1);
-       btrfs_commit_transaction(trans, info->tree_root);
-
-       spin_lock(&block_group->lock);
-       WARN_ON(block_group->pinned > 0);
-       WARN_ON(block_group->reserved > 0);
-       WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
-       spin_unlock(&block_group->lock);
-       btrfs_put_block_group(block_group);
-       ret = 0;
+       mutex_unlock(&root->fs_info->chunk_mutex);
 out:
-       btrfs_free_path(path);
+       btrfs_put_block_group(block_group);
        return ret;
 }
-#endif
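
The space test above reads as a standalone predicate: relocation is plausible when this is not the only block group in the space_info and the remaining room can absorb the group's used bytes. A reduced sketch (hypothetical helper, not in the patch; caller holds space_info->lock):

        static int space_can_absorb(struct btrfs_space_info *si,
                                    u64 group_len, u64 group_used)
        {
                return si->total_bytes != group_len &&
                       si->bytes_used + si->bytes_reserved +
                       si->bytes_pinned + si->bytes_readonly +
                       group_used < si->total_bytes;
        }

When that fails and the space isn't marked full, the device loop falls back to asking whether any allocatable device still has a contiguous hole of at least btrfs_block_group_used() bytes for a replacement chunk.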
 
 static int find_first_block_group(struct btrfs_root *root,
                struct btrfs_path *path, struct btrfs_key *key)
@@ -7165,8 +6847,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
        struct btrfs_block_group_cache *block_group;
        struct btrfs_space_info *space_info;
+       struct btrfs_caching_control *caching_ctl;
        struct rb_node *n;
 
+       down_write(&info->extent_commit_sem);
+       while (!list_empty(&info->caching_block_groups)) {
+               caching_ctl = list_entry(info->caching_block_groups.next,
+                                        struct btrfs_caching_control, list);
+               list_del(&caching_ctl->list);
+               put_caching_control(caching_ctl);
+       }
+       up_write(&info->extent_commit_sem);
+
        spin_lock(&info->block_group_cache_lock);
        while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
                block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -7180,8 +6872,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                up_write(&block_group->space_info->groups_sem);
 
                if (block_group->cached == BTRFS_CACHE_STARTED)
-                       wait_event(block_group->caching_q,
-                                  block_group_cache_done(block_group));
+                       wait_block_group_cache_done(block_group);
 
                btrfs_remove_free_space_cache(block_group);
 
@@ -7251,7 +6942,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                spin_lock_init(&cache->lock);
                spin_lock_init(&cache->tree_lock);
                cache->fs_info = info;
-               init_waitqueue_head(&cache->caching_q);
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
 
@@ -7273,8 +6963,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                cache->flags = btrfs_block_group_flags(&cache->item);
                cache->sectorsize = root->sectorsize;
 
-               remove_sb_from_cache(root, cache);
-
                /*
                 * check for two cases, either we are full, and therefore
                 * don't need to bother with the caching work since we won't
@@ -7283,13 +6971,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                 * time, particularly in the full case.
                 */
                if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+                       exclude_super_stripes(root, cache);
+                       cache->last_byte_to_unpin = (u64)-1;
                        cache->cached = BTRFS_CACHE_FINISHED;
+                       free_excluded_extents(root, cache);
                } else if (btrfs_block_group_used(&cache->item) == 0) {
+                       exclude_super_stripes(root, cache);
+                       cache->last_byte_to_unpin = (u64)-1;
                        cache->cached = BTRFS_CACHE_FINISHED;
                        add_new_free_space(cache, root->fs_info,
                                           found_key.objectid,
                                           found_key.objectid +
                                           found_key.offset);
+                       free_excluded_extents(root, cache);
                }
 
                ret = update_space_info(info, cache->flags, found_key.offset,
@@ -7297,6 +6991,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                                        &space_info);
                BUG_ON(ret);
                cache->space_info = space_info;
+               spin_lock(&cache->space_info->lock);
+               cache->space_info->bytes_super += cache->bytes_super;
+               spin_unlock(&cache->space_info->lock);
+
                down_write(&space_info->groups_sem);
                list_add_tail(&cache->list, &space_info->block_groups);
                up_write(&space_info->groups_sem);
@@ -7346,7 +7044,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
        spin_lock_init(&cache->tree_lock);
-       init_waitqueue_head(&cache->caching_q);
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
 
@@ -7355,15 +7052,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->flags = type;
        btrfs_set_block_group_flags(&cache->item, type);
 
+       cache->last_byte_to_unpin = (u64)-1;
        cache->cached = BTRFS_CACHE_FINISHED;
-       remove_sb_from_cache(root, cache);
+       exclude_super_stripes(root, cache);
 
        add_new_free_space(cache, root->fs_info, chunk_offset,
                           chunk_offset + size);
 
+       free_excluded_extents(root, cache);
+
        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
                                &cache->space_info);
        BUG_ON(ret);
+
+       spin_lock(&cache->space_info->lock);
+       cache->space_info->bytes_super += cache->bytes_super;
+       spin_unlock(&cache->space_info->lock);
+
        down_write(&cache->space_info->groups_sem);
        list_add_tail(&cache->list, &cache->space_info->block_groups);
        up_write(&cache->space_info->groups_sem);
@@ -7429,8 +7134,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        up_write(&block_group->space_info->groups_sem);
 
        if (block_group->cached == BTRFS_CACHE_STARTED)
-               wait_event(block_group->caching_q,
-                          block_group_cache_done(block_group));
+               wait_block_group_cache_done(block_group);
 
        btrfs_remove_free_space_cache(block_group);
 
index 68260180f5871975b8f673df1234bf4091d91bb4..0cb88f8146ea85efa5bc944c9d95995a3d0ad772 100644 (file)
@@ -367,10 +367,10 @@ static int insert_state(struct extent_io_tree *tree,
        }
        if (bits & EXTENT_DIRTY)
                tree->dirty_bytes += end - start + 1;
-       set_state_cb(tree, state, bits);
-       state->state |= bits;
        state->start = start;
        state->end = end;
+       set_state_cb(tree, state, bits);
+       state->state |= bits;
        node = tree_insert(&tree->state, end, &state->rb_node);
        if (node) {
                struct extent_state *found;
@@ -471,10 +471,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
  * bits were already set, or zero if none of the bits were already set.
  */
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    int bits, int wake, int delete, gfp_t mask)
+                    int bits, int wake, int delete,
+                    struct extent_state **cached_state,
+                    gfp_t mask)
 {
        struct extent_state *state;
+       struct extent_state *cached;
        struct extent_state *prealloc = NULL;
+       struct rb_node *next_node;
        struct rb_node *node;
        u64 last_end;
        int err;
@@ -488,6 +492,17 @@ again:
        }
 
        spin_lock(&tree->lock);
+       if (cached_state) {
+               cached = *cached_state;
+               *cached_state = NULL;
+               cached_state = NULL;
+               if (cached && cached->tree && cached->start == start) {
+                       atomic_dec(&cached->refs);
+                       state = cached;
+                       goto hit_next;
+               }
+               free_extent_state(cached);
+       }
        /*
         * this search will find the extents that end after
         * our range starts
@@ -496,6 +511,7 @@ again:
        if (!node)
                goto out;
        state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
        if (state->start > end)
                goto out;
        WARN_ON(state->end < start);
@@ -531,8 +547,6 @@ again:
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
-               } else {
-                       start = state->start;
                }
                goto search_again;
        }
@@ -550,16 +564,28 @@ again:
 
                if (wake)
                        wake_up(&state->wq);
+
                set |= clear_state_bit(tree, prealloc, bits,
                                       wake, delete);
                prealloc = NULL;
                goto out;
        }
 
+       if (state->end < end && prealloc && !need_resched())
+               next_node = rb_next(&state->rb_node);
+       else
+               next_node = NULL;
+
        set |= clear_state_bit(tree, state, bits, wake, delete);
        if (last_end == (u64)-1)
                goto out;
        start = last_end + 1;
+       if (start <= end && next_node) {
+               state = rb_entry(next_node, struct extent_state,
+                                rb_node);
+               if (state->start == start)
+                       goto hit_next;
+       }
        goto search_again;
 
 out:
@@ -653,28 +679,40 @@ static void set_state_bits(struct extent_io_tree *tree,
        state->state |= bits;
 }
 
+static void cache_state(struct extent_state *state,
+                       struct extent_state **cached_ptr)
+{
+       if (cached_ptr && !(*cached_ptr)) {
+               if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+                       *cached_ptr = state;
+                       atomic_inc(&state->refs);
+               }
+       }
+}
+
 /*
- * set some bits on a range in the tree.  This may require allocations
- * or sleeping, so the gfp mask is used to indicate what is allowed.
+ * set some bits on a range in the tree.  This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
  *
- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
- * range already has the desired bits set.  The start of the existing
- * range is returned in failed_start in this case.
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set.  The start of the
+ * existing range is returned in failed_start in this case.
  *
- * [start, end] is inclusive
- * This takes the tree lock.
+ * [start, end] is inclusive.  This takes the tree lock.
  */
+
 static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                         int bits, int exclusive, u64 *failed_start,
+                         int bits, int exclusive_bits, u64 *failed_start,
+                         struct extent_state **cached_state,
                          gfp_t mask)
 {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
        struct rb_node *node;
        int err = 0;
-       int set;
        u64 last_start;
        u64 last_end;
+
 again:
        if (!prealloc && (mask & __GFP_WAIT)) {
                prealloc = alloc_extent_state(mask);
@@ -683,6 +721,13 @@ again:
        }
 
        spin_lock(&tree->lock);
+       if (cached_state && *cached_state) {
+               state = *cached_state;
+               if (state->start == start && state->tree) {
+                       node = &state->rb_node;
+                       goto hit_next;
+               }
+       }
        /*
         * this search will find all the extents that end after
         * our range starts.
@@ -694,8 +739,8 @@ again:
                BUG_ON(err == -EEXIST);
                goto out;
        }
-
        state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
        last_start = state->start;
        last_end = state->end;
 
@@ -706,17 +751,29 @@ again:
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
-               set = state->state & bits;
-               if (set && exclusive) {
+               struct rb_node *next_node;
+               if (state->state & exclusive_bits) {
                        *failed_start = state->start;
                        err = -EEXIST;
                        goto out;
                }
+
                set_state_bits(tree, state, bits);
+               cache_state(state, cached_state);
                merge_state(tree, state);
                if (last_end == (u64)-1)
                        goto out;
+
                start = last_end + 1;
+               if (start < end && prealloc && !need_resched()) {
+                       next_node = rb_next(node);
+                       if (next_node) {
+                               state = rb_entry(next_node, struct extent_state,
+                                                rb_node);
+                               if (state->start == start)
+                                       goto hit_next;
+                       }
+               }
                goto search_again;
        }
 
@@ -737,8 +794,7 @@ again:
         * desired bit on it.
         */
        if (state->start < start) {
-               set = state->state & bits;
-               if (exclusive && set) {
+               if (state->state & exclusive_bits) {
                        *failed_start = start;
                        err = -EEXIST;
                        goto out;
@@ -750,12 +806,11 @@ again:
                        goto out;
                if (state->end <= end) {
                        set_state_bits(tree, state, bits);
+                       cache_state(state, cached_state);
                        merge_state(tree, state);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
-               } else {
-                       start = state->start;
                }
                goto search_again;
        }
@@ -774,6 +829,7 @@ again:
                        this_end = last_start - 1;
                err = insert_state(tree, prealloc, start, this_end,
                                   bits);
+               cache_state(prealloc, cached_state);
                prealloc = NULL;
                BUG_ON(err == -EEXIST);
                if (err)
@@ -788,8 +844,7 @@ again:
         * on the first half
         */
        if (state->start <= end && state->end > end) {
-               set = state->state & bits;
-               if (exclusive && set) {
+               if (state->state & exclusive_bits) {
                        *failed_start = start;
                        err = -EEXIST;
                        goto out;
@@ -798,6 +853,7 @@ again:
                BUG_ON(err == -EEXIST);
 
                set_state_bits(tree, prealloc, bits);
+               cache_state(prealloc, cached_state);
                merge_state(tree, prealloc);
                prealloc = NULL;
                goto out;
@@ -826,86 +882,64 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
 {
        return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
-                             mask);
-}
-
-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
-                      gfp_t mask)
-{
-       return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+                             NULL, mask);
 }
 
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    int bits, gfp_t mask)
 {
        return set_extent_bit(tree, start, end, bits, 0, NULL,
-                             mask);
+                             NULL, mask);
 }
 
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      int bits, gfp_t mask)
 {
-       return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+       return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
 {
        return set_extent_bit(tree, start, end,
-                             EXTENT_DELALLOC | EXTENT_DIRTY,
-                             0, NULL, mask);
+                             EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+                             0, NULL, NULL, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                       gfp_t mask)
 {
        return clear_extent_bit(tree, start, end,
-                               EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
-}
-
-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
-                        gfp_t mask)
-{
-       return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+                               EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+                               NULL, mask);
 }
 
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
 {
        return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
-                             mask);
+                             NULL, mask);
 }
 
 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                       gfp_t mask)
 {
-       return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
+       return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
+                               NULL, mask);
 }
 
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                        gfp_t mask)
 {
        return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
-                             mask);
+                             NULL, mask);
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
                                 u64 end, gfp_t mask)
 {
-       return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
-}
-
-static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
-                        gfp_t mask)
-{
-       return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
-                             0, NULL, mask);
-}
-
-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
-                                 u64 end, gfp_t mask)
-{
-       return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
+       return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+                               NULL, mask);
 }
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -917,13 +951,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
  * either insert or lock state struct between start and end; use mask to
  * tell us if waiting is desired.
  */
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                    int bits, struct extent_state **cached_state, gfp_t mask)
 {
        int err;
        u64 failed_start;
        while (1) {
-               err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
-                                    &failed_start, mask);
+               err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+                                    EXTENT_LOCKED, &failed_start,
+                                    cached_state, mask);
                if (err == -EEXIST && (mask & __GFP_WAIT)) {
                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
                        start = failed_start;
@@ -935,27 +971,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
        return err;
 }
 
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+       return lock_extent_bits(tree, start, end, 0, NULL, mask);
+}
+
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
                    gfp_t mask)
 {
        int err;
        u64 failed_start;
 
-       err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
-                            &failed_start, mask);
+       err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+                            &failed_start, NULL, mask);
        if (err == -EEXIST) {
                if (failed_start > start)
                        clear_extent_bit(tree, start, failed_start - 1,
-                                        EXTENT_LOCKED, 1, 0, mask);
+                                        EXTENT_LOCKED, 1, 0, NULL, mask);
                return 0;
        }
        return 1;
 }
 
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+                        struct extent_state **cached, gfp_t mask)
+{
+       return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+                               mask);
+}
+
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
                  gfp_t mask)
 {
-       return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
+       return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+                               mask);
 }
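
Together, lock_extent_bits, test_range_bit and unlock_extent_cached let a caller pin the extent_state it just locked and hand it back instead of repeating the rbtree search. A minimal usage sketch (error handling omitted) of the pattern find_lock_delalloc_range adopts below:

        /* lock [start, end] and verify it is still delalloc */
        static int lock_delalloc_range(struct extent_io_tree *tree,
                                       u64 start, u64 end)
        {
                struct extent_state *cached = NULL;

                lock_extent_bits(tree, start, end, 0, &cached, GFP_NOFS);
                /* 'cached' now pins the state record covering 'start' */
                if (!test_range_bit(tree, start, end, EXTENT_DELALLOC, 1,
                                    cached)) {
                        /* consumes 'cached'; no fresh tree search needed */
                        unlock_extent_cached(tree, start, end, &cached,
                                             GFP_NOFS);
                        return 0;
                }
                free_extent_state(cached);
                return 1;       /* range stays locked for the caller */
        }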
 
 /*
@@ -974,7 +1023,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
                page_cache_release(page);
                index++;
        }
-       set_extent_dirty(tree, start, end, GFP_NOFS);
        return 0;
 }
 
@@ -994,7 +1042,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
                page_cache_release(page);
                index++;
        }
-       set_extent_writeback(tree, start, end, GFP_NOFS);
        return 0;
 }
 
@@ -1232,6 +1279,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
        u64 delalloc_start;
        u64 delalloc_end;
        u64 found;
+       struct extent_state *cached_state = NULL;
        int ret;
        int loops = 0;
 
@@ -1269,6 +1317,7 @@ again:
                /* some of the pages are gone, lets avoid looping by
                 * shortening the size of the delalloc range we're searching
                 */
+               free_extent_state(cached_state);
                if (!loops) {
                        unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
                        max_bytes = PAGE_CACHE_SIZE - offset;
@@ -1282,18 +1331,21 @@ again:
        BUG_ON(ret);
 
        /* step three, lock the state bits for the whole range */
-       lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+       lock_extent_bits(tree, delalloc_start, delalloc_end,
+                        0, &cached_state, GFP_NOFS);
 
        /* then test to make sure it is all still delalloc */
        ret = test_range_bit(tree, delalloc_start, delalloc_end,
-                            EXTENT_DELALLOC, 1);
+                            EXTENT_DELALLOC, 1, cached_state);
        if (!ret) {
-               unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+               unlock_extent_cached(tree, delalloc_start, delalloc_end,
+                                    &cached_state, GFP_NOFS);
                __unlock_for_delalloc(inode, locked_page,
                              delalloc_start, delalloc_end);
                cond_resched();
                goto again;
        }
+       free_extent_state(cached_state);
        *start = delalloc_start;
        *end = delalloc_end;
 out_failed:
@@ -1307,7 +1359,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
                                int clear_unlock,
                                int clear_delalloc, int clear_dirty,
                                int set_writeback,
-                               int end_writeback)
+                               int end_writeback,
+                               int set_private2)
 {
        int ret;
        struct page *pages[16];
@@ -1325,8 +1378,9 @@ int extent_clear_unlock_delalloc(struct inode *inode,
        if (clear_delalloc)
                clear_bits |= EXTENT_DELALLOC;
 
-       clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
-       if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
+       clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
+       if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
+             set_private2))
                return 0;
 
        while (nr_pages > 0) {
@@ -1334,6 +1388,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
                                     min_t(unsigned long,
                                     nr_pages, ARRAY_SIZE(pages)), pages);
                for (i = 0; i < ret; i++) {
+
+                       if (set_private2)
+                               SetPagePrivate2(pages[i]);
+
                        if (pages[i] == locked_page) {
                                page_cache_release(pages[i]);
                                continue;
@@ -1476,14 +1534,17 @@ out:
  * range is found set.
  */
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  int bits, int filled)
+                  int bits, int filled, struct extent_state *cached)
 {
        struct extent_state *state = NULL;
        struct rb_node *node;
        int bitset = 0;
 
        spin_lock(&tree->lock);
-       node = tree_search(tree, start);
+       if (cached && cached->tree && cached->start == start)
+               node = &cached->rb_node;
+       else
+               node = tree_search(tree, start);
        while (node && start <= end) {
                state = rb_entry(node, struct extent_state, rb_node);
 
@@ -1503,6 +1564,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
                        bitset = 0;
                        break;
                }
+
+               if (state->end == (u64)-1)
+                       break;
+
                start = state->end + 1;
                if (start > end)
                        break;
@@ -1526,7 +1591,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
 {
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 end = start + PAGE_CACHE_SIZE - 1;
-       if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+       if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
                SetPageUptodate(page);
        return 0;
 }
@@ -1540,7 +1605,7 @@ static int check_page_locked(struct extent_io_tree *tree,
 {
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 end = start + PAGE_CACHE_SIZE - 1;
-       if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+       if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
                unlock_page(page);
        return 0;
 }
@@ -1552,10 +1617,7 @@ static int check_page_locked(struct extent_io_tree *tree,
 static int check_page_writeback(struct extent_io_tree *tree,
                             struct page *page)
 {
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-       u64 end = start + PAGE_CACHE_SIZE - 1;
-       if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
-               end_page_writeback(page);
+       end_page_writeback(page);
        return 0;
 }
 
@@ -1613,13 +1675,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
                }
 
                if (!uptodate) {
-                       clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+                       clear_extent_uptodate(tree, start, end, GFP_NOFS);
                        ClearPageUptodate(page);
                        SetPageError(page);
                }
 
-               clear_extent_writeback(tree, start, end, GFP_ATOMIC);
-
                if (whole_page)
                        end_page_writeback(page);
                else
@@ -1983,7 +2043,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                        continue;
                }
                /* the get_extent function already copied into the page */
-               if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+               if (test_range_bit(tree, cur, cur_end,
+                                  EXTENT_UPTODATE, 1, NULL)) {
                        check_page_uptodate(tree, page);
                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
                        cur = cur + iosize;
@@ -2078,6 +2139,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        u64 iosize;
        u64 unlock_start;
        sector_t sector;
+       struct extent_state *cached_state = NULL;
        struct extent_map *em;
        struct block_device *bdev;
        int ret;
@@ -2124,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        delalloc_end = 0;
        page_started = 0;
        if (!epd->extent_locked) {
+               u64 delalloc_to_write = 0;
                /*
                 * make sure the wbc mapping index is at least updated
                 * to this page.
@@ -2143,8 +2206,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                        tree->ops->fill_delalloc(inode, page, delalloc_start,
                                                 delalloc_end, &page_started,
                                                 &nr_written);
+                       /*
+                        * delalloc_end is already one less than the total
+                        * length, so we don't subtract one from
+                        * PAGE_CACHE_SIZE
+                        */
+                       delalloc_to_write += (delalloc_end - delalloc_start +
+                                             PAGE_CACHE_SIZE) >>
+                                             PAGE_CACHE_SHIFT;
                        delalloc_start = delalloc_end + 1;
                }
+               if (wbc->nr_to_write < delalloc_to_write) {
+                       int thresh = 8192;
+
+                       if (delalloc_to_write < thresh * 2)
+                               thresh = delalloc_to_write;
+                       wbc->nr_to_write = min_t(u64, delalloc_to_write,
+                                                thresh);
+               }
 
                /* did the fill delalloc function already unlock and start
                 * the IO?
@@ -2160,15 +2239,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                        goto done_unlocked;
                }
        }
-       lock_extent(tree, start, page_end, GFP_NOFS);
-
-       unlock_start = start;
-
        if (tree->ops && tree->ops->writepage_start_hook) {
                ret = tree->ops->writepage_start_hook(page, start,
                                                      page_end);
                if (ret == -EAGAIN) {
-                       unlock_extent(tree, start, page_end, GFP_NOFS);
                        redirty_page_for_writepage(wbc, page);
                        update_nr_written(page, wbc, nr_written);
                        unlock_page(page);
@@ -2184,12 +2258,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        update_nr_written(page, wbc, nr_written + 1);
 
        end = page_end;
-       if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
-               printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
-
        if (last_byte <= start) {
-               clear_extent_dirty(tree, start, page_end, GFP_NOFS);
-               unlock_extent(tree, start, page_end, GFP_NOFS);
                if (tree->ops && tree->ops->writepage_end_io_hook)
                        tree->ops->writepage_end_io_hook(page, start,
                                                         page_end, NULL, 1);
@@ -2197,13 +2266,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                goto done;
        }
 
-       set_extent_uptodate(tree, start, page_end, GFP_NOFS);
        blocksize = inode->i_sb->s_blocksize;
 
        while (cur <= end) {
                if (cur >= last_byte) {
-                       clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
-                       unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
                        if (tree->ops && tree->ops->writepage_end_io_hook)
                                tree->ops->writepage_end_io_hook(page, cur,
                                                         page_end, NULL, 1);
@@ -2235,12 +2301,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                 */
                if (compressed || block_start == EXTENT_MAP_HOLE ||
                    block_start == EXTENT_MAP_INLINE) {
-                       clear_extent_dirty(tree, cur,
-                                          cur + iosize - 1, GFP_NOFS);
-
-                       unlock_extent(tree, unlock_start, cur + iosize - 1,
-                                     GFP_NOFS);
-
                        /*
                         * end_io notification does not happen here for
                         * compressed extents
@@ -2265,13 +2325,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                }
                /* leave this out until we have a page_mkwrite call */
                if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
-                                  EXTENT_DIRTY, 0)) {
+                                  EXTENT_DIRTY, 0, NULL)) {
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
                }
 
-               clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
                if (tree->ops && tree->ops->writepage_io_hook) {
                        ret = tree->ops->writepage_io_hook(page, cur,
                                                cur + iosize - 1);
@@ -2309,12 +2368,12 @@ done:
                set_page_writeback(page);
                end_page_writeback(page);
        }
-       if (unlock_start <= page_end)
-               unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
        unlock_page(page);
 
 done_unlocked:
 
+       /* drop our reference on any cached states */
+       free_extent_state(cached_state);
        return 0;
 }
 
@@ -2339,9 +2398,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
                             writepage_t writepage, void *data,
                             void (*flush_fn)(void *))
 {
-       struct backing_dev_info *bdi = mapping->backing_dev_info;
        int ret = 0;
        int done = 0;
+       int nr_to_write_done = 0;
        struct pagevec pvec;
        int nr_pages;
        pgoff_t index;
@@ -2361,7 +2420,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
                scanned = 1;
        }
 retry:
-       while (!done && (index <= end) &&
+       while (!done && !nr_to_write_done && (index <= end) &&
               (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
                              PAGECACHE_TAG_DIRTY, min(end - index,
                                  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -2412,12 +2471,15 @@ retry:
                                unlock_page(page);
                                ret = 0;
                        }
-                       if (ret || wbc->nr_to_write <= 0)
-                               done = 1;
-                       if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                               wbc->encountered_congestion = 1;
+                       if (ret)
                                done = 1;
-                       }
+
+                       /*
+                        * the filesystem may choose to bump up nr_to_write.
+                        * We have to make sure to honor the new nr_to_write
+                        * whenever it changes
+                        */
+                       nr_to_write_done = wbc->nr_to_write <= 0;
                }
                pagevec_release(&pvec);
                cond_resched();
@@ -2604,10 +2666,10 @@ int extent_invalidatepage(struct extent_io_tree *tree,
                return 0;
 
        lock_extent(tree, start, end, GFP_NOFS);
-       wait_on_extent_writeback(tree, start, end);
+       wait_on_page_writeback(page);
        clear_extent_bit(tree, start, end,
                         EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
-                        1, 1, GFP_NOFS);
+                        1, 1, NULL, GFP_NOFS);
        return 0;
 }
 
@@ -2687,7 +2749,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
                    !isnew && !PageUptodate(page) &&
                    (block_off_end > to || block_off_start < from) &&
                    !test_range_bit(tree, block_start, cur_end,
-                                   EXTENT_UPTODATE, 1)) {
+                                   EXTENT_UPTODATE, 1, NULL)) {
                        u64 sector;
                        u64 extent_offset = block_start - em->start;
                        size_t iosize;
@@ -2701,7 +2763,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
                         */
                        set_extent_bit(tree, block_start,
                                       block_start + iosize - 1,
-                                      EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+                                      EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
                        ret = submit_extent_page(READ, tree, page,
                                         sector, iosize, page_offset, em->bdev,
                                         NULL, 1,
@@ -2742,13 +2804,18 @@ int try_release_extent_state(struct extent_map_tree *map,
        int ret = 1;
 
        if (test_range_bit(tree, start, end,
-                          EXTENT_IOBITS | EXTENT_ORDERED, 0))
+                          EXTENT_IOBITS, 0, NULL))
                ret = 0;
        else {
                if ((mask & GFP_NOFS) == GFP_NOFS)
                        mask = GFP_NOFS;
-               clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
-                                1, 1, mask);
+               /*
+                * at this point we can safely clear everything except the
+                * locked bit and the nodatasum bit
+                */
+               clear_extent_bit(tree, start, end,
+                                ~(EXTENT_LOCKED | EXTENT_NODATASUM),
+                                0, 0, NULL, mask);
        }
        return ret;
 }
@@ -2771,29 +2838,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
                u64 len;
                while (start <= end) {
                        len = end - start + 1;
-                       spin_lock(&map->lock);
+                       write_lock(&map->lock);
                        em = lookup_extent_mapping(map, start, len);
                        if (!em || IS_ERR(em)) {
-                               spin_unlock(&map->lock);
+                               write_unlock(&map->lock);
                                break;
                        }
                        if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
                            em->start != start) {
-                               spin_unlock(&map->lock);
+                               write_unlock(&map->lock);
                                free_extent_map(em);
                                break;
                        }
                        if (!test_range_bit(tree, em->start,
                                            extent_map_end(em) - 1,
-                                           EXTENT_LOCKED | EXTENT_WRITEBACK |
-                                           EXTENT_ORDERED,
-                                           0)) {
+                                           EXTENT_LOCKED | EXTENT_WRITEBACK,
+                                           0, NULL)) {
                                remove_extent_mapping(map, em);
                                /* once for the rb tree */
                                free_extent_map(em);
                        }
                        start = extent_map_end(em);
-                       spin_unlock(&map->lock);
+                       write_unlock(&map->lock);
 
                        /* once for us */
                        free_extent_map(em);
@@ -3203,7 +3269,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
        int uptodate;
        unsigned long index;
 
-       ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
+       ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
        if (ret)
                return 1;
        while (start <= end) {
@@ -3233,7 +3299,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
                return 1;
 
        ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                          EXTENT_UPTODATE, 1);
+                          EXTENT_UPTODATE, 1, NULL);
        if (ret)
                return ret;
 
@@ -3269,7 +3335,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                return 0;
 
        if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                          EXTENT_UPTODATE, 1)) {
+                          EXTENT_UPTODATE, 1, NULL)) {
                return 0;
        }
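
A minimal sketch (not part of this patch) of the cached extent_state pattern the
extent_io.c hunks above introduce: lock_extent_bits() hands back a referenced
extent_state, test_range_bit() uses it to skip the rb-tree search, and the
reference is dropped either through unlock_extent_cached() or
free_extent_state(), mirroring find_lock_delalloc_range():

        static int lock_if_delalloc(struct extent_io_tree *tree, u64 start, u64 end)
        {
                struct extent_state *cached_state = NULL;

                /* lock the range and remember the state that covers @start */
                lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);

                /* the cached state lets test_range_bit skip the tree search */
                if (!test_range_bit(tree, start, end, EXTENT_DELALLOC, 1,
                                    cached_state)) {
                        /* unlocking through the cache also drops our reference */
                        unlock_extent_cached(tree, start, end, &cached_state,
                                             GFP_NOFS);
                        return 0;
                }

                /* range stays locked; drop only the reference we were handed */
                free_extent_state(cached_state);
                return 1;
        }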
 
index 5bc20abf3f3d340b22248802e4c8d7d72f4dc4bb..14ed16fd862df22a93b7286c4c4811a8c12fa6ba 100644 (file)
 #define EXTENT_DEFRAG (1 << 6)
 #define EXTENT_DEFRAG_DONE (1 << 7)
 #define EXTENT_BUFFER_FILLED (1 << 8)
-#define EXTENT_ORDERED (1 << 9)
-#define EXTENT_ORDERED_METADATA (1 << 10)
-#define EXTENT_BOUNDARY (1 << 11)
-#define EXTENT_NODATASUM (1 << 12)
+#define EXTENT_BOUNDARY (1 << 9)
+#define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /* flags for bio submission */
@@ -142,6 +140,8 @@ int try_release_extent_state(struct extent_map_tree *map,
                             struct extent_io_tree *tree, struct page *page,
                             gfp_t mask);
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                    int bits, struct extent_state **cached, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
                    gfp_t mask);
@@ -155,11 +155,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
                     u64 max_bytes, unsigned long bits);
 
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  int bits, int filled);
+                  int bits, int filled, struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      int bits, gfp_t mask);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    int bits, int wake, int delete, gfp_t mask);
+                    int bits, int wake, int delete, struct extent_state **cached,
+                    gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    int bits, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
@@ -282,5 +283,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
                                int clear_unlock,
                                int clear_delalloc, int clear_dirty,
                                int set_writeback,
-                               int end_writeback);
+                               int end_writeback,
+                               int set_private2);
 #endif
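
With EXTENT_ORDERED and EXTENT_ORDERED_METADATA removed above (the inode.c hunks
below move ordered tracking to the per-page Private2 flag instead), the
remaining flags are renumbered. Restating the resulting layout from the hunk,
with the old values in comments:

        #define EXTENT_BUFFER_FILLED (1 << 8)
        #define EXTENT_BOUNDARY (1 << 9)        /* was (1 << 11) */
        #define EXTENT_NODATASUM (1 << 10)      /* was (1 << 12) */
        #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
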
index 30c9365861e69602f96e4debc96bacf8889028e1..2c726b7b9faa17af725b8a5b8319148758b7dd91 100644 (file)
@@ -36,7 +36,7 @@ void extent_map_exit(void)
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
        tree->map.rb_node = NULL;
-       spin_lock_init(&tree->lock);
+       rwlock_init(&tree->lock);
 }
 
 /**
@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
        return 0;
 }
 
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
+       int ret = 0;
+       struct extent_map *merge = NULL;
+       struct rb_node *rb;
+       struct extent_map *em;
+
+       write_lock(&tree->lock);
+       em = lookup_extent_mapping(tree, start, len);
+
+       WARN_ON(!em || em->start != start);
+
+       if (!em)
+               goto out;
+
+       clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+       if (em->start != 0) {
+               rb = rb_prev(&em->rb_node);
+               if (rb)
+                       merge = rb_entry(rb, struct extent_map, rb_node);
+               if (rb && mergable_maps(merge, em)) {
+                       em->start = merge->start;
+                       em->len += merge->len;
+                       em->block_len += merge->block_len;
+                       em->block_start = merge->block_start;
+                       merge->in_tree = 0;
+                       rb_erase(&merge->rb_node, &tree->map);
+                       free_extent_map(merge);
+               }
+       }
+
+       rb = rb_next(&em->rb_node);
+       if (rb)
+               merge = rb_entry(rb, struct extent_map, rb_node);
+       if (rb && mergable_maps(em, merge)) {
+               em->len += merge->len;
+               em->block_len += merge->block_len;
+               rb_erase(&merge->rb_node, &tree->map);
+               merge->in_tree = 0;
+               free_extent_map(merge);
+       }
+
+       free_extent_map(em);
+out:
+       write_unlock(&tree->lock);
+       return ret;
+
+}
+
 /**
  * add_extent_mapping - add new extent map to the extent tree
  * @tree:      tree to insert new map in
@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
                ret = -EEXIST;
                goto out;
        }
-       assert_spin_locked(&tree->lock);
        rb = tree_insert(&tree->map, em->start, &em->rb_node);
        if (rb) {
                ret = -EEXIST;
@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
        struct rb_node *next = NULL;
        u64 end = range_end(start, len);
 
-       assert_spin_locked(&tree->lock);
        rb_node = __tree_search(&tree->map, start, &prev, &next);
        if (!rb_node && prev) {
                em = rb_entry(prev, struct extent_map, rb_node);
@@ -318,6 +366,54 @@ out:
        return em;
 }
 
+/**
+ * search_extent_mapping - find a nearby extent map
+ * @tree:      tree to lookup in
+ * @start:     byte offset to start the search
+ * @len:       length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, start + len) range.
+ *
+ * If one can't be found, any nearby extent may be returned
+ */
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+                                        u64 start, u64 len)
+{
+       struct extent_map *em;
+       struct rb_node *rb_node;
+       struct rb_node *prev = NULL;
+       struct rb_node *next = NULL;
+
+       rb_node = __tree_search(&tree->map, start, &prev, &next);
+       if (!rb_node && prev) {
+               em = rb_entry(prev, struct extent_map, rb_node);
+               goto found;
+       }
+       if (!rb_node && next) {
+               em = rb_entry(next, struct extent_map, rb_node);
+               goto found;
+       }
+       if (!rb_node) {
+               em = NULL;
+               goto out;
+       }
+       if (IS_ERR(rb_node)) {
+               em = ERR_PTR(PTR_ERR(rb_node));
+               goto out;
+       }
+       em = rb_entry(rb_node, struct extent_map, rb_node);
+       goto found;
+found:
+       atomic_inc(&em->refs);
+out:
+       return em;
+}
+
 /**
  * remove_extent_mapping - removes an extent_map from the extent tree
  * @tree:      extent tree to remove from
@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
        int ret = 0;
 
        WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
-       assert_spin_locked(&tree->lock);
        rb_erase(&em->rb_node, &tree->map);
        em->in_tree = 0;
        return ret;
index fb6eeef06bb0a88abcf9297c2c6d9156a35af552..ab6d74b6e6477dcfb1bf65df494c5749cc8d80d9 100644 (file)
@@ -31,7 +31,7 @@ struct extent_map {
 
 struct extent_map_tree {
        struct rb_root map;
-       spinlock_t lock;
+       rwlock_t lock;
 };
 
 static inline u64 extent_map_end(struct extent_map *em)
@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
 void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void extent_map_exit(void);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+                                        u64 start, u64 len);
 #endif
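
A hedged sketch (not from the patch) of the locking rule implied by the
spinlock-to-rwlock conversion above: lookups in an extent_map_tree now take the
lock shared, while insert, remove and unpin paths take it exclusive, matching
the read_lock()/write_lock() call sites elsewhere in this series:

        static struct extent_map *em_lookup(struct extent_map_tree *tree,
                                            u64 start, u64 len)
        {
                struct extent_map *em;

                read_lock(&tree->lock);
                em = lookup_extent_mapping(tree, start, len); /* takes a ref */
                read_unlock(&tree->lock);
                return em; /* caller does free_extent_map() */
        }

        static int em_insert(struct extent_map_tree *tree, struct extent_map *em)
        {
                int ret;

                write_lock(&tree->lock);
                ret = add_extent_mapping(tree, em); /* -EEXIST on overlap */
                write_unlock(&tree->lock);
                return ret;
        }
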
index 4b833972273a75218eb775cf8caf64dc1be80ed7..571ad3c13b47be3de640f4ce84253f3ce593b26d 100644 (file)
@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
        int err = 0;
        int i;
        struct inode *inode = fdentry(file)->d_inode;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       u64 hint_byte;
        u64 num_bytes;
        u64 start_pos;
        u64 end_of_last_block;
@@ -125,22 +123,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
                    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
        end_of_last_block = start_pos + num_bytes - 1;
-
-       lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-       trans = btrfs_join_transaction(root, 1);
-       if (!trans) {
-               err = -ENOMEM;
-               goto out_unlock;
-       }
-       btrfs_set_trans_block_group(trans, inode);
-       hint_byte = 0;
-
-       set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-
-       /* check for reserved extents on each page, we don't want
-        * to reset the delalloc bit on things that already have
-        * extents reserved.
-        */
        btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
        for (i = 0; i < num_pages; i++) {
                struct page *p = pages[i];
@@ -155,9 +137,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
                 * at this time.
                 */
        }
-       err = btrfs_end_transaction(trans, root);
-out_unlock:
-       unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
        return err;
 }
 
@@ -189,18 +168,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                if (!split2)
                        split2 = alloc_extent_map(GFP_NOFS);
 
-               spin_lock(&em_tree->lock);
+               write_lock(&em_tree->lock);
                em = lookup_extent_mapping(em_tree, start, len);
                if (!em) {
-                       spin_unlock(&em_tree->lock);
+                       write_unlock(&em_tree->lock);
                        break;
                }
                flags = em->flags;
                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
-                       spin_unlock(&em_tree->lock);
                        if (em->start <= start &&
                            (!testend || em->start + em->len >= start + len)) {
                                free_extent_map(em);
+                               write_unlock(&em_tree->lock);
                                break;
                        }
                        if (start < em->start) {
@@ -210,6 +189,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                                start = em->start + em->len;
                        }
                        free_extent_map(em);
+                       write_unlock(&em_tree->lock);
                        continue;
                }
                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -260,7 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        free_extent_map(split);
                        split = NULL;
                }
-               spin_unlock(&em_tree->lock);
+               write_unlock(&em_tree->lock);
 
                /* once for us */
                free_extent_map(em);
@@ -289,7 +269,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct inode *inode,
                       u64 start, u64 end, u64 locked_end,
-                      u64 inline_limit, u64 *hint_byte)
+                      u64 inline_limit, u64 *hint_byte, int drop_cache)
 {
        u64 extent_end = 0;
        u64 search_start = start;
@@ -314,7 +294,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
        int ret;
 
        inline_limit = 0;
-       btrfs_drop_extent_cache(inode, start, end - 1, 0);
+       if (drop_cache)
+               btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
        path = btrfs_alloc_path();
        if (!path)
index 5edcee3a617f44e4608cefd709a6e8d7c38e14a5..5c2caad76212d2b5bfcc4329e255978525d6f5ed 100644 (file)
@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 
 static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
 {
-       u64 max_bytes, possible_bytes;
+       u64 max_bytes;
+       u64 bitmap_bytes;
+       u64 extent_bytes;
 
        /*
         * The goal is to keep the total amount of memory used per 1gb of space
@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
        max_bytes = MAX_CACHE_BYTES_PER_GIG *
                (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
 
-       possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
-               (sizeof(struct btrfs_free_space) *
-                block_group->extents_thresh);
+       /*
+        * we want to account for 1 more bitmap than what we have so we can make
+        * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
+        * we add more bitmaps.
+        */
+       bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
 
-       if (possible_bytes > max_bytes) {
-               int extent_bytes = max_bytes -
-                       (block_group->total_bitmaps * PAGE_CACHE_SIZE);
+       if (bitmap_bytes >= max_bytes) {
+               block_group->extents_thresh = 0;
+               return;
+       }
 
-               if (extent_bytes <= 0) {
-                       block_group->extents_thresh = 0;
-                       return;
-               }
+       /*
+        * we want the extent entry threshold to always be at most 1/2 the
+        * max bytes we can have, or whatever is left after the bitmaps,
+        * whichever is smaller.
+        */
+       extent_bytes = max_bytes - bitmap_bytes;
+       extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
 
-               block_group->extents_thresh = extent_bytes /
-                       (sizeof(struct btrfs_free_space));
-       }
+       block_group->extents_thresh =
+               div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
 }
 
 static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
        BUG_ON(block_group->total_bitmaps >= max_bitmaps);
 
        info->offset = offset_to_bitmap(block_group, offset);
+       info->bytes = 0;
        link_free_space(block_group, info);
        block_group->total_bitmaps++;
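
A worked example of the new threshold math, using illustrative constants that
are assumptions here, not values from this patch (say MAX_CACHE_BYTES_PER_GIG =
32KiB, PAGE_CACHE_SIZE = 4KiB, sizeof(struct btrfs_free_space) = 32), for a
1GiB block group that already holds two bitmaps:

        /*
         * max_bytes      = 32768 * 1                     = 32768
         * bitmap_bytes   = (2 + 1) * 4096                = 12288 (< max_bytes)
         * extent_bytes   = min(32768 - 12288, 32768 / 2) = 16384
         * extents_thresh = 16384 / 32                    = 512 extent entries
         */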
 
index 6b627c6118081ccea87d1acaf92000f995dda946..72ce3c173d6a3d111826e40ce8421b8f7f57f9b1 100644 (file)
@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
                ptr = (unsigned long)(ref + 1);
                ret = 0;
        } else if (ret < 0) {
+               if (ret == -EOVERFLOW)
+                       ret = -EMLINK;
                goto out;
        } else {
                ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(struct btrfs_inode_item));
-       if (ret == 0 && objectid > root->highest_inode)
-               root->highest_inode = objectid;
        return ret;
 }
 
index 9abbced1123dc67139e2de6af415c0b41cfcfda5..c56eb5909172956da6354eef29ea36f7f698a8fc 100644 (file)
@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
                slot = path->slots[0] - 1;
                l = path->nodes[0];
                btrfs_item_key_to_cpu(l, &found_key, slot);
-               *objectid = found_key.objectid;
+               *objectid = max_t(u64, found_key.objectid,
+                                 BTRFS_FIRST_FREE_OBJECTID - 1);
        } else {
-               *objectid = BTRFS_FIRST_FREE_OBJECTID;
+               *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
        }
        ret = 0;
 error:
@@ -53,91 +54,27 @@ error:
        return ret;
 }
 
-/*
- * walks the btree of allocated inodes and find a hole.
- */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             u64 dirid, u64 *objectid)
 {
-       struct btrfs_path *path;
-       struct btrfs_key key;
        int ret;
-       int slot = 0;
-       u64 last_ino = 0;
-       int start_found;
-       struct extent_buffer *l;
-       struct btrfs_key search_key;
-       u64 search_start = dirid;
-
        mutex_lock(&root->objectid_mutex);
-       if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
-           root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
-               *objectid = ++root->last_inode_alloc;
-               mutex_unlock(&root->objectid_mutex);
-               return 0;
-       }
-       path = btrfs_alloc_path();
-       BUG_ON(!path);
-       search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
-       search_key.objectid = search_start;
-       search_key.type = 0;
-       search_key.offset = 0;
-
-       start_found = 0;
-       ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
-       if (ret < 0)
-               goto error;
 
-       while (1) {
-               l = path->nodes[0];
-               slot = path->slots[0];
-               if (slot >= btrfs_header_nritems(l)) {
-                       ret = btrfs_next_leaf(root, path);
-                       if (ret == 0)
-                               continue;
-                       if (ret < 0)
-                               goto error;
-                       if (!start_found) {
-                               *objectid = search_start;
-                               start_found = 1;
-                               goto found;
-                       }
-                       *objectid = last_ino > search_start ?
-                               last_ino : search_start;
-                       goto found;
-               }
-               btrfs_item_key_to_cpu(l, &key, slot);
-               if (key.objectid >= search_start) {
-                       if (start_found) {
-                               if (last_ino < search_start)
-                                       last_ino = search_start;
-                               if (key.objectid > last_ino) {
-                                       *objectid = last_ino;
-                                       goto found;
-                               }
-                       } else if (key.objectid > search_start) {
-                               *objectid = search_start;
-                               goto found;
-                       }
-               }
-               if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
-                       break;
+       if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
+               ret = btrfs_find_highest_inode(root, &root->highest_objectid);
+               if (ret)
+                       goto out;
+       }
 
-               start_found = 1;
-               last_ino = key.objectid + 1;
-               path->slots[0]++;
+       if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+               ret = -ENOSPC;
+               goto out;
        }
-       BUG_ON(1);
-found:
-       btrfs_release_path(root, path);
-       btrfs_free_path(path);
-       BUG_ON(*objectid < search_start);
-       mutex_unlock(&root->objectid_mutex);
-       return 0;
-error:
-       btrfs_release_path(root, path);
-       btrfs_free_path(path);
+
+       *objectid = ++root->highest_objectid;
+       ret = 0;
+out:
        mutex_unlock(&root->objectid_mutex);
        return ret;
 }
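
The rewrite above replaces the btree walk with a cached counter; a sketch (not
from the patch) of the resulting fast path. Seeding highest_objectid with
BTRFS_FIRST_FREE_OBJECTID - 1 means the first pre-increment hands out exactly
BTRFS_FIRST_FREE_OBJECTID:

        /* sketch only: callers hold root->objectid_mutex */
        static int next_objectid(struct btrfs_root *root, u64 *objectid)
        {
                if (root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)
                        return -ENOSPC;
                *objectid = ++root->highest_objectid;
                return 0;
        }
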
index 9096fd0ca3ca447a7195f489f03d862340173494..976bfda032e062cc8958e79ea36a60c9df6a8ba1 100644 (file)
@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
        }
 
        ret = btrfs_drop_extents(trans, root, inode, start,
-                                aligned_end, aligned_end, start, &hint_byte);
+                                aligned_end, aligned_end, start,
+                                &hint_byte, 1);
        BUG_ON(ret);
 
        if (isize > actual_end)
@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
                                   inline_len, compressed_size,
                                   compressed_pages);
        BUG_ON(ret);
-       btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+       btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
        return 0;
 }
 
@@ -425,7 +426,7 @@ again:
                        extent_clear_unlock_delalloc(inode,
                                                     &BTRFS_I(inode)->io_tree,
                                                     start, end, NULL, 1, 0,
-                                                    0, 1, 1, 1);
+                                                    0, 1, 1, 1, 0);
                        ret = 0;
                        goto free_pages_out;
                }
@@ -611,9 +612,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 
                while (1) {
-                       spin_lock(&em_tree->lock);
+                       write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em);
-                       spin_unlock(&em_tree->lock);
+                       write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
                                break;
@@ -640,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
-                                            NULL, 1, 1, 0, 1, 1, 0);
+                                            NULL, 1, 1, 0, 1, 1, 0, 0);
 
                ret = btrfs_submit_compressed_write(inode,
                                    async_extent->start,
@@ -713,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode,
                        extent_clear_unlock_delalloc(inode,
                                                     &BTRFS_I(inode)->io_tree,
                                                     start, end, NULL, 1, 1,
-                                                    1, 1, 1, 1);
+                                                    1, 1, 1, 1, 0);
                        *nr_written = *nr_written +
                             (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
                        *page_started = 1;
@@ -725,6 +726,15 @@ static noinline int cow_file_range(struct inode *inode,
        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(&root->fs_info->super_copy));
 
+
+       read_lock(&BTRFS_I(inode)->extent_tree.lock);
+       em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
+                                  start, num_bytes);
+       if (em) {
+               alloc_hint = em->block_start;
+               free_extent_map(em);
+       }
+       read_unlock(&BTRFS_I(inode)->extent_tree.lock);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
        while (disk_num_bytes > 0) {
@@ -737,7 +747,6 @@ static noinline int cow_file_range(struct inode *inode,
                em = alloc_extent_map(GFP_NOFS);
                em->start = start;
                em->orig_start = em->start;
-
                ram_size = ins.offset;
                em->len = ins.offset;
 
@@ -747,9 +756,9 @@ static noinline int cow_file_range(struct inode *inode,
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
                while (1) {
-                       spin_lock(&em_tree->lock);
+                       write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em);
-                       spin_unlock(&em_tree->lock);
+                       write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
                                break;
@@ -776,11 +785,14 @@ static noinline int cow_file_range(struct inode *inode,
                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
+                *
+                * Do set the Private2 bit so we know this page was properly
+                * set up for writepage
                 */
                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                             start, start + ram_size - 1,
                                             locked_page, unlock, 1,
-                                            1, 0, 0, 0);
+                                            1, 0, 0, 0, 1);
                disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
@@ -853,7 +865,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
        int limit = 10 * 1024 * 1024;
 
        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
-                        EXTENT_DELALLOC, 1, 0, GFP_NOFS);
+                        EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                async_cow->inode = inode;
@@ -1080,9 +1092,9 @@ out_check:
                        em->bdev = root->fs_info->fs_devices->latest_bdev;
                        set_bit(EXTENT_FLAG_PINNED, &em->flags);
                        while (1) {
-                               spin_lock(&em_tree->lock);
+                               write_lock(&em_tree->lock);
                                ret = add_extent_mapping(em_tree, em);
-                               spin_unlock(&em_tree->lock);
+                               write_unlock(&em_tree->lock);
                                if (ret != -EEXIST) {
                                        free_extent_map(em);
                                        break;
@@ -1101,7 +1113,7 @@ out_check:
 
                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                        cur_offset, cur_offset + num_bytes - 1,
-                                       locked_page, 1, 1, 1, 0, 0, 0);
+                                       locked_page, 1, 1, 1, 0, 0, 0, 1);
                cur_offset = extent_end;
                if (cur_offset > end)
                        break;
@@ -1374,10 +1386,8 @@ again:
        lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
 
        /* already ordered? We're done */
-       if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-                            EXTENT_ORDERED, 0)) {
+       if (PagePrivate2(page))
                goto out;
-       }
 
        ordered = btrfs_lookup_ordered_extent(inode, page_start);
        if (ordered) {
@@ -1413,11 +1423,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
        struct inode *inode = page->mapping->host;
        struct btrfs_writepage_fixup *fixup;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       int ret;
 
-       ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
-                            EXTENT_ORDERED, 0);
-       if (ret)
+       /* this page is properly in the ordered list */
+       if (TestClearPagePrivate2(page))
                return 0;
 
        if (PageChecked(page))
@@ -1455,9 +1463,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        BUG_ON(!path);
 
        path->leave_spinning = 1;
+
+       /*
+        * we may be replacing one extent in the tree with another.
+        * The new extent is pinned in the extent map, and we don't want
+        * to drop it from the cache until it is completely in the btree.
+        *
+        * So, tell btrfs_drop_extents to leave this extent in the cache.
+        * The caller is expected to unpin it and allow it to be merged
+        * with the others.
+        */
        ret = btrfs_drop_extents(trans, root, inode, file_pos,
                                 file_pos + num_bytes, locked_end,
-                                file_pos, &hint);
+                                file_pos, &hint, 0);
        BUG_ON(ret);
 
        ins.objectid = inode->i_ino;
@@ -1485,7 +1503,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
 
        inode_add_bytes(inode, num_bytes);
-       btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
 
        ins.objectid = disk_bytenr;
        ins.offset = disk_num_bytes;
@@ -1596,6 +1613,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                                ordered_extent->len,
                                                compressed, 0, 0,
                                                BTRFS_FILE_EXTENT_REG);
+               unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+                                  ordered_extent->file_offset,
+                                  ordered_extent->len);
                BUG_ON(ret);
        }
        unlock_extent(io_tree, ordered_extent->file_offset,
@@ -1623,6 +1643,7 @@ nocow:
 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                                struct extent_state *state, int uptodate)
 {
+       ClearPagePrivate2(page);
        return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
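
Tying the ordered-IO hunks together, a sketch (not from the patch) of the
pinned-extent handoff: btrfs_drop_extents() is told to keep the pinned extent
map in the cache while the file extent item is inserted, and completion then
unpins it so it can merge with its neighbours via unpin_extent_cache():

        static void finish_one_ordered(struct inode *inode, u64 file_offset, u64 len)
        {
                /*
                 * insert_reserved_file_extent() ran with drop_cache == 0, so
                 * the pinned extent map is still cached here; clearing the
                 * pin lets unpin_extent_cache() merge it with its neighbours.
                 */
                unpin_extent_cache(&BTRFS_I(inode)->extent_tree, file_offset, len);
        }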
 
@@ -1669,13 +1690,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
                failrec->last_mirror = 0;
                failrec->bio_flags = 0;
 
-               spin_lock(&em_tree->lock);
+               read_lock(&em_tree->lock);
                em = lookup_extent_mapping(em_tree, start, failrec->len);
                if (em->start > start || em->start + em->len < start) {
                        free_extent_map(em);
                        em = NULL;
                }
-               spin_unlock(&em_tree->lock);
+               read_unlock(&em_tree->lock);
 
                if (!em || IS_ERR(em)) {
                        kfree(failrec);
@@ -1794,7 +1815,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                return 0;
 
        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
-           test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
+           test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
                clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
                                  GFP_NOFS);
                return 0;
@@ -2352,6 +2373,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        return ret;
 }
 
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       struct inode *dir, u64 objectid,
+                       const char *name, int name_len)
+{
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct btrfs_dir_item *di;
+       struct btrfs_key key;
+       u64 index;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+                                  name, name_len, -1);
+       BUG_ON(!di || IS_ERR(di));
+
+       leaf = path->nodes[0];
+       btrfs_dir_item_key_to_cpu(leaf, di, &key);
+       WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+       ret = btrfs_delete_one_dir_name(trans, root, path, di);
+       BUG_ON(ret);
+       btrfs_release_path(root, path);
+
+       ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+                                objectid, root->root_key.objectid,
+                                dir->i_ino, &index, name, name_len);
+       if (ret < 0) {
+               BUG_ON(ret != -ENOENT);
+               di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+                                                name, name_len);
+               BUG_ON(!di || IS_ERR(di));
+
+               leaf = path->nodes[0];
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+               btrfs_release_path(root, path);
+               index = key.offset;
+       }
+
+       di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+                                        index, name, name_len, -1);
+       BUG_ON(!di || IS_ERR(di));
+
+       leaf = path->nodes[0];
+       btrfs_dir_item_key_to_cpu(leaf, di, &key);
+       WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+       ret = btrfs_delete_one_dir_name(trans, root, path, di);
+       BUG_ON(ret);
+       btrfs_release_path(root, path);
+
+       btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+       ret = btrfs_update_inode(trans, root, dir);
+       BUG_ON(ret);
+       dir->i_sb->s_dirt = 1;
+
+       btrfs_free_path(path);
+       return 0;
+}
+
 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
@@ -2361,29 +2445,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        struct btrfs_trans_handle *trans;
        unsigned long nr = 0;
 
-       /*
-        * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
-        * the root of a subvolume or snapshot
-        */
        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
-           inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+           inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                return -ENOTEMPTY;
-       }
 
        trans = btrfs_start_transaction(root, 1);
        btrfs_set_trans_block_group(trans, dir);
 
+       if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+               err = btrfs_unlink_subvol(trans, root, dir,
+                                         BTRFS_I(inode)->location.objectid,
+                                         dentry->d_name.name,
+                                         dentry->d_name.len);
+               goto out;
+       }
+
        err = btrfs_orphan_add(trans, inode);
        if (err)
-               goto fail_trans;
+               goto out;
 
        /* now the directory is empty */
        err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
                                 dentry->d_name.name, dentry->d_name.len);
        if (!err)
                btrfs_i_size_write(inode, 0);
-
-fail_trans:
+out:
        nr = trans->blocks_used;
        ret = btrfs_end_transaction_throttle(trans, root);
        btrfs_btree_balance_dirty(root, nr);
@@ -2935,7 +3021,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
                                                 cur_offset,
                                                 cur_offset + hole_size,
                                                 block_end,
-                                                cur_offset, &hint_byte);
+                                                cur_offset, &hint_byte, 1);
                        if (err)
                                break;
                        err = btrfs_insert_file_extent(trans, root,
@@ -3003,6 +3089,11 @@ void btrfs_delete_inode(struct inode *inode)
        }
        btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
+       if (inode->i_nlink > 0) {
+               BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+               goto no_delete;
+       }
+
        btrfs_i_size_write(inode, 0);
        trans = btrfs_join_transaction(root, 1);
 
@@ -3070,29 +3161,67 @@ out_err:
  * is kind of like crossing a mount point.
  */
 static int fixup_tree_root_location(struct btrfs_root *root,
-                            struct btrfs_key *location,
-                            struct btrfs_root **sub_root,
-                            struct dentry *dentry)
+                                   struct inode *dir,
+                                   struct dentry *dentry,
+                                   struct btrfs_key *location,
+                                   struct btrfs_root **sub_root)
 {
-       struct btrfs_root_item *ri;
+       struct btrfs_path *path;
+       struct btrfs_root *new_root;
+       struct btrfs_root_ref *ref;
+       struct extent_buffer *leaf;
+       int ret;
+       int err = 0;
 
-       if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
-               return 0;
-       if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
-               return 0;
+       path = btrfs_alloc_path();
+       if (!path) {
+               err = -ENOMEM;
+               goto out;
+       }
 
-       *sub_root = btrfs_read_fs_root(root->fs_info, location,
-                                       dentry->d_name.name,
-                                       dentry->d_name.len);
-       if (IS_ERR(*sub_root))
-               return PTR_ERR(*sub_root);
+       err = -ENOENT;
+       ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
+                                 BTRFS_I(dir)->root->root_key.objectid,
+                                 location->objectid);
+       if (ret) {
+               if (ret < 0)
+                       err = ret;
+               goto out;
+       }
 
-       ri = &(*sub_root)->root_item;
-       location->objectid = btrfs_root_dirid(ri);
-       btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
-       location->offset = 0;
+       leaf = path->nodes[0];
+       ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+       if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
+           btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
+               goto out;
 
-       return 0;
+       ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
+                                  (unsigned long)(ref + 1),
+                                  dentry->d_name.len);
+       if (ret)
+               goto out;
+
+       btrfs_release_path(root->fs_info->tree_root, path);
+
+       new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
+       if (IS_ERR(new_root)) {
+               err = PTR_ERR(new_root);
+               goto out;
+       }
+
+       if (btrfs_root_refs(&new_root->root_item) == 0) {
+               err = -ENOENT;
+               goto out;
+       }
+
+       *sub_root = new_root;
+       location->objectid = btrfs_root_dirid(&new_root->root_item);
+       location->type = BTRFS_INODE_ITEM_KEY;
+       location->offset = 0;
+       err = 0;
+out:
+       btrfs_free_path(path);
+       return err;
 }
 
 static void inode_tree_add(struct inode *inode)
@@ -3101,11 +3230,13 @@ static void inode_tree_add(struct inode *inode)
        struct btrfs_inode *entry;
        struct rb_node **p;
        struct rb_node *parent;
-
 again:
        p = &root->inode_tree.rb_node;
        parent = NULL;
 
+       if (hlist_unhashed(&inode->i_hash))
+               return;
+
        spin_lock(&root->inode_lock);
        while (*p) {
                parent = *p;
@@ -3132,13 +3263,87 @@ again:
 static void inode_tree_del(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       int empty = 0;
 
        spin_lock(&root->inode_lock);
        if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
                rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
                RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+               empty = RB_EMPTY_ROOT(&root->inode_tree);
        }
        spin_unlock(&root->inode_lock);
+
+       if (empty && btrfs_root_refs(&root->root_item) == 0) {
+               synchronize_srcu(&root->fs_info->subvol_srcu);
+               spin_lock(&root->inode_lock);
+               empty = RB_EMPTY_ROOT(&root->inode_tree);
+               spin_unlock(&root->inode_lock);
+               if (empty)
+                       btrfs_add_dead_root(root);
+       }
+}
+
+int btrfs_invalidate_inodes(struct btrfs_root *root)
+{
+       struct rb_node *node;
+       struct rb_node *prev;
+       struct btrfs_inode *entry;
+       struct inode *inode;
+       u64 objectid = 0;
+
+       WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+       spin_lock(&root->inode_lock);
+again:
+       node = root->inode_tree.rb_node;
+       prev = NULL;
+       while (node) {
+               prev = node;
+               entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+               if (objectid < entry->vfs_inode.i_ino)
+                       node = node->rb_left;
+               else if (objectid > entry->vfs_inode.i_ino)
+                       node = node->rb_right;
+               else
+                       break;
+       }
+       if (!node) {
+               while (prev) {
+                       entry = rb_entry(prev, struct btrfs_inode, rb_node);
+                       if (objectid <= entry->vfs_inode.i_ino) {
+                               node = prev;
+                               break;
+                       }
+                       prev = rb_next(prev);
+               }
+       }
+       while (node) {
+               entry = rb_entry(node, struct btrfs_inode, rb_node);
+               objectid = entry->vfs_inode.i_ino + 1;
+               inode = igrab(&entry->vfs_inode);
+               if (inode) {
+                       spin_unlock(&root->inode_lock);
+                       if (atomic_read(&inode->i_count) > 1)
+                               d_prune_aliases(inode);
+                       /*
+                        * btrfs_drop_inode will remove it from
+                        * the inode cache when its usage count
+                        * hits zero.
+                        */
+                       iput(inode);
+                       cond_resched();
+                       spin_lock(&root->inode_lock);
+                       goto again;
+               }
+
+               if (cond_resched_lock(&root->inode_lock))
+                       goto again;
+
+               node = rb_next(node);
+       }
+       spin_unlock(&root->inode_lock);
+       return 0;
 }
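
The new tail of inode_tree_del() and btrfs_invalidate_inodes() cooperate when a subvolume is destroyed: the ioctl path (shown later in this series of hunks) drops the root's refcount to zero and evicts the cached inodes, and whichever path finally empties the rb-tree queues the root for cleanup. A rough ordering sketch, assuming the snap_destroy caller:

        btrfs_set_root_refs(&dest->root_item, 0);  /* mark the root dead */
        btrfs_invalidate_inodes(dest);             /* igrab/iput every cached inode */
        /* the final iput() -> inode_tree_del() sees an empty tree on a
         * dead root and calls btrfs_add_dead_root(dest) */
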
 
 static noinline void init_btrfs_i(struct inode *inode)
@@ -3225,15 +3430,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
        return inode;
 }
 
+static struct inode *new_simple_dir(struct super_block *s,
+                                   struct btrfs_key *key,
+                                   struct btrfs_root *root)
+{
+       struct inode *inode = new_inode(s);
+
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+
+       init_btrfs_i(inode);
+
+       BTRFS_I(inode)->root = root;
+       memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+       BTRFS_I(inode)->dummy_inode = 1;
+
+       inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+       inode->i_op = &simple_dir_inode_operations;
+       inode->i_fop = &simple_dir_operations;
+       inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+       inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+       return inode;
+}
+
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode;
-       struct btrfs_inode *bi = BTRFS_I(dir);
-       struct btrfs_root *root = bi->root;
+       struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_root *sub_root = root;
        struct btrfs_key location;
+       int index;
        int ret;
 
+       dentry->d_op = &btrfs_dentry_operations;
+
        if (dentry->d_name.len > BTRFS_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
 
@@ -3242,29 +3473,50 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
        if (ret < 0)
                return ERR_PTR(ret);
 
-       inode = NULL;
-       if (location.objectid) {
-               ret = fixup_tree_root_location(root, &location, &sub_root,
-                                               dentry);
-               if (ret < 0)
-                       return ERR_PTR(ret);
-               if (ret > 0)
-                       return ERR_PTR(-ENOENT);
+       if (location.objectid == 0)
+               return NULL;
+
+       if (location.type == BTRFS_INODE_ITEM_KEY) {
+               inode = btrfs_iget(dir->i_sb, &location, root);
+               return inode;
+       }
+
+       BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
+
+       index = srcu_read_lock(&root->fs_info->subvol_srcu);
+       ret = fixup_tree_root_location(root, dir, dentry,
+                                      &location, &sub_root);
+       if (ret < 0) {
+               if (ret != -ENOENT)
+                       inode = ERR_PTR(ret);
+               else
+                       inode = new_simple_dir(dir->i_sb, &location, sub_root);
+       } else {
                inode = btrfs_iget(dir->i_sb, &location, sub_root);
-               if (IS_ERR(inode))
-                       return ERR_CAST(inode);
        }
+       srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+
        return inode;
 }
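
A compact summary of the new lookup flow above (a sketch mirroring the code, not additional logic):

        /*
         * btrfs_lookup_dentry() decision table:
         *   location.objectid == 0          -> negative dentry (NULL)
         *   location.type == INODE_ITEM_KEY -> btrfs_iget() in the current root
         *   location.type == ROOT_ITEM_KEY  -> fixup_tree_root_location():
         *       ref found   -> btrfs_iget() in the child subvolume
         *       ref missing -> new_simple_dir(), an empty placeholder that
         *                      keeps a dangling subvolume link stat-able
         * all under srcu_read_lock(subvol_srcu), so a concurrent subvolume
         * destroy cannot free the root out from under the lookup.
         */
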
 
+static int btrfs_dentry_delete(struct dentry *dentry)
+{
+       struct btrfs_root *root;
+
+       if (!dentry->d_inode)
+               return 0;
+
+       root = BTRFS_I(dentry->d_inode)->root;
+       if (btrfs_root_refs(&root->root_item) == 0)
+               return 1;
+       return 0;
+}
+
 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
                                   struct nameidata *nd)
 {
        struct inode *inode;
 
-       if (dentry->d_name.len > BTRFS_NAME_LEN)
-               return ERR_PTR(-ENAMETOOLONG);
-
        inode = btrfs_lookup_dentry(dir, dentry);
        if (IS_ERR(inode))
                return ERR_CAST(inode);
@@ -3603,9 +3855,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        if (ret != 0)
                goto fail;
 
-       if (objectid > root->highest_inode)
-               root->highest_inode = objectid;
-
        inode->i_uid = current_fsuid();
 
        if (dir && (dir->i_mode & S_ISGID)) {
@@ -3673,26 +3922,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
                   struct inode *parent_inode, struct inode *inode,
                   const char *name, int name_len, int add_backref, u64 index)
 {
-       int ret;
+       int ret = 0;
        struct btrfs_key key;
        struct btrfs_root *root = BTRFS_I(parent_inode)->root;
 
-       key.objectid = inode->i_ino;
-       btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-       key.offset = 0;
+       if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+               memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
+       } else {
+               key.objectid = inode->i_ino;
+               btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+               key.offset = 0;
+       }
+
+       if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+               ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+                                        key.objectid, root->root_key.objectid,
+                                        parent_inode->i_ino,
+                                        index, name, name_len);
+       } else if (add_backref) {
+               ret = btrfs_insert_inode_ref(trans, root,
+                                            name, name_len, inode->i_ino,
+                                            parent_inode->i_ino, index);
+       }
 
-       ret = btrfs_insert_dir_item(trans, root, name, name_len,
-                                   parent_inode->i_ino,
-                                   &key, btrfs_inode_type(inode),
-                                   index);
        if (ret == 0) {
-               if (add_backref) {
-                       ret = btrfs_insert_inode_ref(trans, root,
-                                                    name, name_len,
-                                                    inode->i_ino,
-                                                    parent_inode->i_ino,
-                                                    index);
-               }
+               ret = btrfs_insert_dir_item(trans, root, name, name_len,
+                                           parent_inode->i_ino, &key,
+                                           btrfs_inode_type(inode), index);
+               BUG_ON(ret);
+
                btrfs_i_size_write(parent_inode, parent_inode->i_size +
                                   name_len * 2);
                parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
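
The reordered btrfs_add_link() stores a different payload key in the dir item depending on what is being linked, which is exactly the key.type that btrfs_lookup_dentry() above dispatches on. Schematically:

        /* regular inode:  (inode->i_ino, BTRFS_INODE_ITEM_KEY, 0)        */
        /* subvolume link: the child root's root_key, whose type is       */
        /*                 BTRFS_ROOT_ITEM_KEY                            */
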
@@ -3875,18 +4133,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 
        err = btrfs_add_nondir(trans, dentry, inode, 1, index);
 
-       if (err)
-               drop_inode = 1;
-
-       btrfs_update_inode_block_group(trans, dir);
-       err = btrfs_update_inode(trans, root, inode);
-
-       if (err)
+       if (err) {
                drop_inode = 1;
+       } else {
+               btrfs_update_inode_block_group(trans, dir);
+               err = btrfs_update_inode(trans, root, inode);
+               BUG_ON(err);
+               btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+       }
 
        nr = trans->blocks_used;
-
-       btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
        btrfs_end_transaction_throttle(trans, root);
 fail:
        if (drop_inode) {
@@ -4064,11 +4320,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
        int compressed;
 
 again:
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        if (em)
                em->bdev = root->fs_info->fs_devices->latest_bdev;
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
 
        if (em) {
                if (em->start > start || em->start + em->len <= start)
@@ -4215,6 +4471,11 @@ again:
                                map = kmap(page);
                                read_extent_buffer(leaf, map + pg_offset, ptr,
                                                   copy_size);
+                               if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+                                       memset(map + pg_offset + copy_size, 0,
+                                              PAGE_CACHE_SIZE - pg_offset -
+                                              copy_size);
+                               }
                                kunmap(page);
                        }
                        flush_dcache_page(page);
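
The added memset covers the tail of the page that the inline extent does not reach. A worked example with assumed numbers:

        /* PAGE_CACHE_SIZE == 4096, pg_offset == 0, copy_size == 3000:
         * read_extent_buffer() fills bytes 0..2999, so bytes 3000..4095
         * would otherwise expose stale page contents; the memset zeroes
         * exactly 4096 - 0 - 3000 == 1096 bytes.
         */
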
@@ -4259,7 +4520,7 @@ insert:
        }
 
        err = 0;
-       spin_lock(&em_tree->lock);
+       write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em);
        /* it is possible that someone inserted the extent into the tree
         * while we had the lock dropped.  It is also possible that
@@ -4299,7 +4560,7 @@ insert:
                        err = 0;
                }
        }
-       spin_unlock(&em_tree->lock);
+       write_unlock(&em_tree->lock);
 out:
        if (path)
                btrfs_free_path(path);
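
These hunks are part of a tree-wide switch of extent_map_tree from a spinlock to an rwlock, so extent map lookups no longer serialize against each other. The resulting pattern, as a minimal sketch:

        read_lock(&em_tree->lock);              /* lookups may run in parallel */
        em = lookup_extent_mapping(em_tree, start, len);
        read_unlock(&em_tree->lock);

        write_lock(&em_tree->lock);             /* inserts remain exclusive */
        ret = add_extent_mapping(em_tree, em);
        write_unlock(&em_tree->lock);
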
@@ -4398,13 +4659,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
        u64 page_start = page_offset(page);
        u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
+       /*
+        * we have the page locked, so new writeback can't start,
+        * and the dirty bit won't be cleared while we are here.
+        *
+        * Wait for IO on this page so that we can safely clear
+        * the PagePrivate2 bit and do ordered accounting
+        */
        wait_on_page_writeback(page);
+
        tree = &BTRFS_I(page->mapping->host)->io_tree;
        if (offset) {
                btrfs_releasepage(page, GFP_NOFS);
                return;
        }
-
        lock_extent(tree, page_start, page_end, GFP_NOFS);
        ordered = btrfs_lookup_ordered_extent(page->mapping->host,
                                           page_offset(page));
@@ -4415,16 +4684,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                 */
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_DIRTY | EXTENT_DELALLOC |
-                                EXTENT_LOCKED, 1, 0, GFP_NOFS);
-               btrfs_finish_ordered_io(page->mapping->host,
-                                       page_start, page_end);
+                                EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+               /*
+                * whoever cleared the private bit is responsible
+                * for the finish_ordered_io
+                */
+               if (TestClearPagePrivate2(page)) {
+                       btrfs_finish_ordered_io(page->mapping->host,
+                                               page_start, page_end);
+               }
                btrfs_put_ordered_extent(ordered);
                lock_extent(tree, page_start, page_end, GFP_NOFS);
        }
        clear_extent_bit(tree, page_start, page_end,
-                EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-                EXTENT_ORDERED,
-                1, 1, GFP_NOFS);
+                EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+                1, 1, NULL, GFP_NOFS);
        __btrfs_releasepage(page, GFP_NOFS);
 
        ClearPageChecked(page);
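
The PagePrivate2 bit replaces the old per-range EXTENT_ORDERED state: it is set while a page belongs to an ordered extent, and whichever path atomically clears it owns the ordered accounting. The contract, as used above:

        if (TestClearPagePrivate2(page))        /* only one winner */
                btrfs_finish_ordered_io(page->mapping->host,
                                        page_start, page_end);
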
@@ -4521,11 +4795,14 @@ again:
        }
        ClearPageChecked(page);
        set_page_dirty(page);
+       SetPageUptodate(page);
 
        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
+       if (!ret)
+               return VM_FAULT_LOCKED;
        unlock_page(page);
 out:
        return ret;
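
Returning VM_FAULT_LOCKED from page_mkwrite hands the page back to the fault code still locked and up to date, closing the window where it could be truncated or reclaimed between page_mkwrite and fault completion. A sketch of the mm-side contract (2.6.31-era behaviour, assumed; not code from this patch):

        /*
         *   ret = vma->vm_ops->page_mkwrite(vma, vmf);
         *   if (ret & VM_FAULT_LOCKED)
         *           the page is already locked; the fault handler uses
         *           it directly and unlocks it when done.
         */
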
@@ -4594,11 +4871,11 @@ out:
  * create a new subvolume directory/inode (helper for the ioctl).
  */
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *new_root, struct dentry *dentry,
+                            struct btrfs_root *new_root,
                             u64 new_dirid, u64 alloc_hint)
 {
        struct inode *inode;
-       int error;
+       int err;
        u64 index = 0;
 
        inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
@@ -4611,11 +4888,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
        inode->i_nlink = 1;
        btrfs_i_size_write(inode, 0);
 
-       error = btrfs_update_inode(trans, new_root, inode);
-       if (error)
-               return error;
+       err = btrfs_update_inode(trans, new_root, inode);
+       BUG_ON(err);
 
-       d_instantiate(dentry, inode);
+       iput(inode);
        return 0;
 }
 
@@ -4693,6 +4969,16 @@ void btrfs_destroy_inode(struct inode *inode)
        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
+void btrfs_drop_inode(struct inode *inode)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
+       if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
+               generic_delete_inode(inode);
+       else
+               generic_drop_inode(inode);
+}
+
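btrfs_drop_inode() lets an inode that still has links be evicted and deleted immediately once its root is dead; otherwise generic_drop_inode() keeps the usual caching behaviour. The matching hookup lives in super.c, outside this excerpt; a sketch of what it would look like:

        static const struct super_operations btrfs_super_ops = {
                /* ... */
                .drop_inode     = btrfs_drop_inode,
                /* ... */
        };
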
 static void init_once(void *foo)
 {
        struct btrfs_inode *ei = (struct btrfs_inode *) foo;
@@ -4761,31 +5047,32 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 {
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(old_dir)->root;
+       struct btrfs_root *dest = BTRFS_I(new_dir)->root;
        struct inode *new_inode = new_dentry->d_inode;
        struct inode *old_inode = old_dentry->d_inode;
        struct timespec ctime = CURRENT_TIME;
        u64 index = 0;
+       u64 root_objectid;
        int ret;
 
-       /* we're not allowed to rename between subvolumes */
-       if (BTRFS_I(old_inode)->root->root_key.objectid !=
-           BTRFS_I(new_dir)->root->root_key.objectid)
+       if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+               return -EPERM;
+
+       /* we only allow rename subvolume link between subvolumes */
+       if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
                return -EXDEV;
 
-       if (S_ISDIR(old_inode->i_mode) && new_inode &&
-           new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
+       if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+           (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
                return -ENOTEMPTY;
-       }
 
-       /* to rename a snapshot or subvolume, we need to juggle the
-        * backrefs.  This isn't coded yet
-        */
-       if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
-               return -EXDEV;
+       if (S_ISDIR(old_inode->i_mode) && new_inode &&
+           new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+               return -ENOTEMPTY;
 
        ret = btrfs_check_metadata_free_space(root);
        if (ret)
-               goto out_unlock;
+               return ret;
 
        /*
         * we're using rename to replace one file with another.
@@ -4796,8 +5083,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
            old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
                filemap_flush(old_inode->i_mapping);
 
+       /* close the racy window with snapshot create/destroy ioctl */
+       if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&root->fs_info->subvol_sem);
+
        trans = btrfs_start_transaction(root, 1);
+       btrfs_set_trans_block_group(trans, new_dir);
+
+       if (dest != root)
+               btrfs_record_root_in_trans(trans, dest);
 
+       ret = btrfs_set_inode_index(new_dir, &index);
+       if (ret)
+               goto out_fail;
+
+       if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+               /* force full log commit if subvolume involved. */
+               root->fs_info->last_trans_log_full_commit = trans->transid;
+       } else {
+               ret = btrfs_insert_inode_ref(trans, dest,
+                                            new_dentry->d_name.name,
+                                            new_dentry->d_name.len,
+                                            old_inode->i_ino,
+                                            new_dir->i_ino, index);
+               if (ret)
+                       goto out_fail;
+               /*
+                * this is an ugly little race, but the rename is required
+                * to make sure that if we crash, the inode is either at the
+                * old name or the new one.  pinning the log transaction lets
+                * us make sure we don't allow a log commit to come in after
+                * we unlink the name but before we add the new name back in.
+                */
+               btrfs_pin_log_trans(root);
+       }
        /*
         * make sure the inode gets flushed if it is replacing
         * something.
@@ -4807,18 +5126,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                btrfs_add_ordered_operation(trans, root, old_inode);
        }
 
-       /*
-        * this is an ugly little race, but the rename is required to make
-        * sure that if we crash, the inode is either at the old name
-        * or the new one.  pinning the log transaction lets us make sure
-        * we don't allow a log commit to come in after we unlink the
-        * name but before we add the new name back in.
-        */
-       btrfs_pin_log_trans(root);
-
-       btrfs_set_trans_block_group(trans, new_dir);
-
-       btrfs_inc_nlink(old_dentry->d_inode);
        old_dir->i_ctime = old_dir->i_mtime = ctime;
        new_dir->i_ctime = new_dir->i_mtime = ctime;
        old_inode->i_ctime = ctime;
@@ -4826,47 +5133,58 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (old_dentry->d_parent != new_dentry->d_parent)
                btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
 
-       ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
-                                old_dentry->d_name.name,
-                                old_dentry->d_name.len);
-       if (ret)
-               goto out_fail;
+       if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+               root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
+                                       old_dentry->d_name.name,
+                                       old_dentry->d_name.len);
+       } else {
+               btrfs_inc_nlink(old_dentry->d_inode);
+               ret = btrfs_unlink_inode(trans, root, old_dir,
+                                        old_dentry->d_inode,
+                                        old_dentry->d_name.name,
+                                        old_dentry->d_name.len);
+       }
+       BUG_ON(ret);
 
        if (new_inode) {
                new_inode->i_ctime = CURRENT_TIME;
-               ret = btrfs_unlink_inode(trans, root, new_dir,
-                                        new_dentry->d_inode,
-                                        new_dentry->d_name.name,
-                                        new_dentry->d_name.len);
-               if (ret)
-                       goto out_fail;
+               if (unlikely(new_inode->i_ino ==
+                            BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+                       root_objectid = BTRFS_I(new_inode)->location.objectid;
+                       ret = btrfs_unlink_subvol(trans, dest, new_dir,
+                                               root_objectid,
+                                               new_dentry->d_name.name,
+                                               new_dentry->d_name.len);
+                       BUG_ON(new_inode->i_nlink == 0);
+               } else {
+                       ret = btrfs_unlink_inode(trans, dest, new_dir,
+                                                new_dentry->d_inode,
+                                                new_dentry->d_name.name,
+                                                new_dentry->d_name.len);
+               }
+               BUG_ON(ret);
                if (new_inode->i_nlink == 0) {
                        ret = btrfs_orphan_add(trans, new_dentry->d_inode);
-                       if (ret)
-                               goto out_fail;
+                       BUG_ON(ret);
                }
-
        }
-       ret = btrfs_set_inode_index(new_dir, &index);
-       if (ret)
-               goto out_fail;
 
-       ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
-                            old_inode, new_dentry->d_name.name,
-                            new_dentry->d_name.len, 1, index);
-       if (ret)
-               goto out_fail;
+       ret = btrfs_add_link(trans, new_dir, old_inode,
+                            new_dentry->d_name.name,
+                            new_dentry->d_name.len, 0, index);
+       BUG_ON(ret);
 
-       btrfs_log_new_name(trans, old_inode, old_dir,
-                                      new_dentry->d_parent);
+       if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+               btrfs_log_new_name(trans, old_inode, old_dir,
+                                  new_dentry->d_parent);
+               btrfs_end_log_trans(root);
+       }
 out_fail:
-
-       /* this btrfs_end_log_trans just allows the current
-        * log-sub transaction to complete
-        */
-       btrfs_end_log_trans(root);
        btrfs_end_transaction_throttle(trans, root);
-out_unlock:
+
+       if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&root->fs_info->subvol_sem);
        return ret;
 }
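
Condensed control flow of the reworked rename for the subvolume-link case (a summary sketch of the hunks above, not additional code):

        down_read(&fs_info->subvol_sem);        /* exclude snap create/destroy */
        trans = btrfs_start_transaction(root, 1);
        fs_info->last_trans_log_full_commit = trans->transid;  /* no tree-log */
        btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
                            old_name, old_len);
        btrfs_add_link(trans, new_dir, old_inode, new_name, new_len, 0, index);
        btrfs_end_transaction_throttle(trans, root);
        up_read(&fs_info->subvol_sem);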
 
@@ -5058,6 +5376,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
                                                  0, 0, 0,
                                                  BTRFS_FILE_EXTENT_PREALLOC);
                BUG_ON(ret);
+               btrfs_drop_extent_cache(inode, cur_offset,
+                                       cur_offset + ins.offset - 1, 0);
                num_bytes -= ins.offset;
                cur_offset += ins.offset;
                alloc_hint = ins.objectid + ins.offset;
@@ -5223,6 +5543,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
        .lookup         = btrfs_lookup,
        .permission     = btrfs_permission,
 };
+
 static struct file_operations btrfs_dir_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
@@ -5309,3 +5630,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
 };
+
+struct dentry_operations btrfs_dentry_operations = {
+       .d_delete       = btrfs_dentry_delete,
+};
index bd88f25889f7c5daf94bf0aec8645041bfb2ce60..a8577a7f26ab248984357a0b8b6505013cff3dba 100644 (file)
@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root,
        struct btrfs_root_item root_item;
        struct btrfs_inode_item *inode_item;
        struct extent_buffer *leaf;
-       struct btrfs_root *new_root = root;
-       struct inode *dir;
+       struct btrfs_root *new_root;
+       struct inode *dir = dentry->d_parent->d_inode;
        int ret;
        int err;
        u64 objectid;
@@ -241,7 +241,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 
        ret = btrfs_check_metadata_free_space(root);
        if (ret)
-               goto fail_commit;
+               return ret;
 
        trans = btrfs_start_transaction(root, 1);
        BUG_ON(!trans);
@@ -304,11 +304,17 @@ static noinline int create_subvol(struct btrfs_root *root,
        if (ret)
                goto fail;
 
+       key.offset = (u64)-1;
+       new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+       BUG_ON(IS_ERR(new_root));
+
+       btrfs_record_root_in_trans(trans, new_root);
+
+       ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
+                                      BTRFS_I(dir)->block_group);
        /*
         * insert the directory item
         */
-       key.offset = (u64)-1;
-       dir = dentry->d_parent->d_inode;
        ret = btrfs_set_inode_index(dir, &index);
        BUG_ON(ret);
 
@@ -322,44 +328,18 @@ static noinline int create_subvol(struct btrfs_root *root,
        ret = btrfs_update_inode(trans, root, dir);
        BUG_ON(ret);
 
-       /* add the backref first */
        ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
-                                objectid, BTRFS_ROOT_BACKREF_KEY,
-                                root->root_key.objectid,
+                                objectid, root->root_key.objectid,
                                 dir->i_ino, index, name, namelen);
 
        BUG_ON(ret);
 
-       /* now add the forward ref */
-       ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
-                                root->root_key.objectid, BTRFS_ROOT_REF_KEY,
-                                objectid,
-                                dir->i_ino, index, name, namelen);
-
-       BUG_ON(ret);
-
-       ret = btrfs_commit_transaction(trans, root);
-       if (ret)
-               goto fail_commit;
-
-       new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
-       BUG_ON(!new_root);
-
-       trans = btrfs_start_transaction(new_root, 1);
-       BUG_ON(!trans);
-
-       ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
-                                      BTRFS_I(dir)->block_group);
-       if (ret)
-               goto fail;
-
+       d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
 fail:
        nr = trans->blocks_used;
-       err = btrfs_commit_transaction(trans, new_root);
+       err = btrfs_commit_transaction(trans, root);
        if (err && !ret)
                ret = err;
-fail_commit:
-       btrfs_btree_balance_dirty(root, nr);
        return ret;
 }
 
@@ -420,14 +400,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
  * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
  * inside this filesystem so it's quite a bit simpler.
  */
-static noinline int btrfs_mksubvol(struct path *parent, char *name,
-                                  int mode, int namelen,
+static noinline int btrfs_mksubvol(struct path *parent,
+                                  char *name, int namelen,
                                   struct btrfs_root *snap_src)
 {
+       struct inode *dir  = parent->dentry->d_inode;
        struct dentry *dentry;
        int error;
 
-       mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
 
        dentry = lookup_one_len(name, parent->dentry, namelen);
        error = PTR_ERR(dentry);
@@ -438,99 +419,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
        if (dentry->d_inode)
                goto out_dput;
 
-       if (!IS_POSIXACL(parent->dentry->d_inode))
-               mode &= ~current_umask();
-
        error = mnt_want_write(parent->mnt);
        if (error)
                goto out_dput;
 
-       error = btrfs_may_create(parent->dentry->d_inode, dentry);
+       error = btrfs_may_create(dir, dentry);
        if (error)
                goto out_drop_write;
 
-       /*
-        * Actually perform the low-level subvolume creation after all
-        * this VFS fuzz.
-        *
-        * Eventually we want to pass in an inode under which we create this
-        * subvolume, but for now all are under the filesystem root.
-        *
-        * Also we should pass on the mode eventually to allow creating new
-        * subvolume with specific mode bits.
-        */
+       down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+
+       if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
+               goto out_up_read;
+
        if (snap_src) {
-               struct dentry *dir = dentry->d_parent;
-               struct dentry *test = dir->d_parent;
-               struct btrfs_path *path = btrfs_alloc_path();
-               int ret;
-               u64 test_oid;
-               u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
-
-               test_oid = snap_src->root_key.objectid;
-
-               ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
-                                         path, parent_oid, test_oid);
-               if (ret == 0)
-                       goto create;
-               btrfs_release_path(snap_src->fs_info->tree_root, path);
-
-               /* we need to make sure we aren't creating a directory loop
-                * by taking a snapshot of something that has our current
-                * subvol in its directory tree.  So, this loops through
-                * the dentries and checks the forward refs for each subvolume
-                * to see if is references the subvolume where we are
-                * placing this new snapshot.
-                */
-               while (1) {
-                       if (!test ||
-                           dir == snap_src->fs_info->sb->s_root ||
-                           test == snap_src->fs_info->sb->s_root ||
-                           test->d_inode->i_sb != snap_src->fs_info->sb) {
-                               break;
-                       }
-                       if (S_ISLNK(test->d_inode->i_mode)) {
-                               printk(KERN_INFO "Btrfs symlink in snapshot "
-                                      "path, failed\n");
-                               error = -EMLINK;
-                               btrfs_free_path(path);
-                               goto out_drop_write;
-                       }
-                       test_oid =
-                               BTRFS_I(test->d_inode)->root->root_key.objectid;
-                       ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
-                                 path, test_oid, parent_oid);
-                       if (ret == 0) {
-                               printk(KERN_INFO "Btrfs snapshot creation "
-                                      "failed, looping\n");
-                               error = -EMLINK;
-                               btrfs_free_path(path);
-                               goto out_drop_write;
-                       }
-                       btrfs_release_path(snap_src->fs_info->tree_root, path);
-                       test = test->d_parent;
-               }
-create:
-               btrfs_free_path(path);
-               error = create_snapshot(snap_src, dentry, name, namelen);
+               error = create_snapshot(snap_src, dentry,
+                                       name, namelen);
        } else {
-               error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
-                                     dentry, name, namelen);
+               error = create_subvol(BTRFS_I(dir)->root, dentry,
+                                     name, namelen);
        }
-       if (error)
-               goto out_drop_write;
-
-       fsnotify_mkdir(parent->dentry->d_inode, dentry);
+       if (!error)
+               fsnotify_mkdir(dir, dentry);
+out_up_read:
+       up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
 out_drop_write:
        mnt_drop_write(parent->mnt);
 out_dput:
        dput(dentry);
 out_unlock:
-       mutex_unlock(&parent->dentry->d_inode->i_mutex);
+       mutex_unlock(&dir->i_mutex);
        return error;
 }
 
-
 static int btrfs_defrag_file(struct file *file)
 {
        struct inode *inode = fdentry(file)->d_inode;
@@ -596,9 +517,8 @@ again:
                clear_page_dirty_for_io(page);
 
                btrfs_set_extent_delalloc(inode, page_start, page_end);
-
-               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
                set_page_dirty(page);
+               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
                unlock_page(page);
                page_cache_release(page);
                balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
@@ -609,7 +529,8 @@ out_unlock:
        return 0;
 }
 
-static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
+static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+                                       void __user *arg)
 {
        u64 new_size;
        u64 old_size;
@@ -718,10 +639,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 {
        struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_vol_args *vol_args;
-       struct btrfs_dir_item *di;
-       struct btrfs_path *path;
        struct file *src_file;
-       u64 root_dirid;
        int namelen;
        int ret = 0;
 
@@ -739,32 +657,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
                goto out;
        }
 
-       path = btrfs_alloc_path();
-       if (!path) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
-       di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
-                           path, root_dirid,
-                           vol_args->name, namelen, 0);
-       btrfs_free_path(path);
-
-       if (di && !IS_ERR(di)) {
-               ret = -EEXIST;
-               goto out;
-       }
-
-       if (IS_ERR(di)) {
-               ret = PTR_ERR(di);
-               goto out;
-       }
-
        if (subvol) {
-               ret = btrfs_mksubvol(&file->f_path, vol_args->name,
-                                    file->f_path.dentry->d_inode->i_mode,
-                                    namelen, NULL);
+               ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+                                    NULL);
        } else {
                struct inode *src_inode;
                src_file = fget(vol_args->fd);
@@ -781,17 +676,156 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
                        fput(src_file);
                        goto out;
                }
-               ret = btrfs_mksubvol(&file->f_path, vol_args->name,
-                            file->f_path.dentry->d_inode->i_mode,
-                            namelen, BTRFS_I(src_inode)->root);
+               ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+                                    BTRFS_I(src_inode)->root);
                fput(src_file);
        }
-
 out:
        kfree(vol_args);
        return ret;
 }
 
+/*
+ * helper to check if the subvolume references other subvolumes
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = root->root_key.objectid;
+       key.type = BTRFS_ROOT_REF_KEY;
+       key.offset = (u64)-1;
+
+       ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
+                               &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+       BUG_ON(ret == 0);
+
+       ret = 0;
+       if (path->slots[0] > 0) {
+               path->slots[0]--;
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+               if (key.objectid == root->root_key.objectid &&
+                   key.type == BTRFS_ROOT_REF_KEY)
+                       ret = -ENOTEMPTY;
+       }
+out:
+       btrfs_free_path(path);
+       return ret;
+}
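
may_destroy_subvol() relies on key ordering: ROOT_REF items are keyed (parent root id, BTRFS_ROOT_REF_KEY, child root id), so searching for offset (u64)-1 lands just past the last child ref, and stepping back one slot reveals whether any child subvolume still exists. A worked example with assumed ids:

        /* tree root keys, sorted by (objectid, type, offset):
         *   (256, BTRFS_ROOT_REF_KEY, 258)   <- subvol 256 contains 258
         * the search key (256, BTRFS_ROOT_REF_KEY, -1ULL) is not found and
         * the path lands one past that item; slots[0] - 1 still matches on
         * (objectid, type), so the result is -ENOTEMPTY.
         */
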
+
+static noinline int btrfs_ioctl_snap_destroy(struct file *file,
+                                            void __user *arg)
+{
+       struct dentry *parent = fdentry(file);
+       struct dentry *dentry;
+       struct inode *dir = parent->d_inode;
+       struct inode *inode;
+       struct btrfs_root *root = BTRFS_I(dir)->root;
+       struct btrfs_root *dest = NULL;
+       struct btrfs_ioctl_vol_args *vol_args;
+       struct btrfs_trans_handle *trans;
+       int namelen;
+       int ret;
+       int err = 0;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       vol_args = memdup_user(arg, sizeof(*vol_args));
+       if (IS_ERR(vol_args))
+               return PTR_ERR(vol_args);
+
+       vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+       namelen = strlen(vol_args->name);
+       if (strchr(vol_args->name, '/') ||
+           strncmp(vol_args->name, "..", namelen) == 0) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err = mnt_want_write(file->f_path.mnt);
+       if (err)
+               goto out;
+
+       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+       dentry = lookup_one_len(vol_args->name, parent, namelen);
+       if (IS_ERR(dentry)) {
+               err = PTR_ERR(dentry);
+               goto out_unlock_dir;
+       }
+
+       if (!dentry->d_inode) {
+               err = -ENOENT;
+               goto out_dput;
+       }
+
+       inode = dentry->d_inode;
+       if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+               err = -EINVAL;
+               goto out_dput;
+       }
+
+       dest = BTRFS_I(inode)->root;
+
+       mutex_lock(&inode->i_mutex);
+       err = d_invalidate(dentry);
+       if (err)
+               goto out_unlock;
+
+       down_write(&root->fs_info->subvol_sem);
+
+       err = may_destroy_subvol(dest);
+       if (err)
+               goto out_up_write;
+
+       trans = btrfs_start_transaction(root, 1);
+       ret = btrfs_unlink_subvol(trans, root, dir,
+                               dest->root_key.objectid,
+                               dentry->d_name.name,
+                               dentry->d_name.len);
+       BUG_ON(ret);
+
+       btrfs_record_root_in_trans(trans, dest);
+
+       memset(&dest->root_item.drop_progress, 0,
+               sizeof(dest->root_item.drop_progress));
+       dest->root_item.drop_level = 0;
+       btrfs_set_root_refs(&dest->root_item, 0);
+
+       ret = btrfs_insert_orphan_item(trans,
+                               root->fs_info->tree_root,
+                               dest->root_key.objectid);
+       BUG_ON(ret);
+
+       ret = btrfs_commit_transaction(trans, root);
+       BUG_ON(ret);
+       inode->i_flags |= S_DEAD;
+out_up_write:
+       up_write(&root->fs_info->subvol_sem);
+out_unlock:
+       mutex_unlock(&inode->i_mutex);
+       if (!err) {
+               btrfs_invalidate_inodes(dest);
+               d_delete(dentry);
+       }
+out_dput:
+       dput(dentry);
+out_unlock_dir:
+       mutex_unlock(&dir->i_mutex);
+       mnt_drop_write(file->f_path.mnt);
+out:
+       kfree(vol_args);
+       return err;
+}
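
Note that the destroy ioctl never frees the subvolume's space itself: the committed transaction leaves the root with refs == 0 plus an orphan item in the tree root, and the actual tree teardown happens later, resumable after a crash via that orphan item (which is what btrfs_find_orphan_item() below supports). A rough sketch of the cleanup side, under that assumption:

        if (btrfs_root_refs(&root->root_item) == 0)
                btrfs_drop_snapshot(root, 0);   /* walks and frees the tree;
                                                   the orphan item marks the
                                                   work as still pending */
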
+
 static int btrfs_ioctl_defrag(struct file *file)
 {
        struct inode *inode = fdentry(file)->d_inode;
@@ -865,8 +899,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
        return ret;
 }
 
-static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
-               u64 off, u64 olen, u64 destoff)
+static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+                                      u64 off, u64 olen, u64 destoff)
 {
        struct inode *inode = fdentry(file)->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -976,7 +1010,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
        /* punch hole in destination first */
        btrfs_drop_extents(trans, root, inode, off, off + len,
-                          off + len, 0, &hint_byte);
+                          off + len, 0, &hint_byte, 1);
 
        /* clone data */
        key.objectid = src->i_ino;
@@ -1071,8 +1105,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                        datao += off - key.offset;
                                        datal -= off - key.offset;
                                }
-                               if (key.offset + datao + datal + key.offset >
-                                   off + len)
+                               if (key.offset + datao + datal > off + len)
                                        datal = off + len - key.offset - datao;
                                /* disko == 0 means it's a hole */
                                if (!disko)
@@ -1258,6 +1291,8 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_snap_create(file, argp, 0);
        case BTRFS_IOC_SUBVOL_CREATE:
                return btrfs_ioctl_snap_create(file, argp, 1);
+       case BTRFS_IOC_SNAP_DESTROY:
+               return btrfs_ioctl_snap_destroy(file, argp);
        case BTRFS_IOC_DEFRAG:
                return btrfs_ioctl_defrag(file);
        case BTRFS_IOC_RESIZE:
index b320b103fa132dadf345168a80fff04bd41be8b3..bc49914475ebbebc5271f3d26d486d730610178d 100644 (file)
@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args {
 
 #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
                                   struct btrfs_ioctl_vol_args)
-
+#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
+                               struct btrfs_ioctl_vol_args)
 #endif
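
A minimal userspace sketch of the new ioctl; dirfd is assumed to be an open fd for the directory containing the snapshot, "snap1" is an example name, and error handling is omitted:

        #include <string.h>
        #include <sys/ioctl.h>
        #include "ioctl.h"              /* the BTRFS_IOC_* defines above */

        struct btrfs_ioctl_vol_args args;
        int ret;

        memset(&args, 0, sizeof(args));
        strncpy(args.name, "snap1", BTRFS_PATH_NAME_MAX);  /* a name, not a path */
        ret = ioctl(dirfd, BTRFS_IOC_SNAP_DESTROY, &args);
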
index 7b2f401e604e3e4706ba2b8d8b9646949cfca3db..b5d6d24726b0014463f15041bbc9ddd74fdb110e 100644 (file)
@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  *
  * len is the length of the extent
  *
- * This also sets the EXTENT_ORDERED bit on the range in the inode.
- *
  * The tree is given a single reference on the ordered extent that was
  * inserted.
  */
@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        entry->start = start;
        entry->len = len;
        entry->disk_len = disk_len;
+       entry->bytes_left = len;
        entry->inode = inode;
        if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
                set_bit(type, &entry->flags);
@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                           &entry->rb_node);
        BUG_ON(node);
 
-       set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
-                          entry_end(entry) - 1, GFP_NOFS);
-
        spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
        list_add_tail(&entry->root_extent_list,
                      &BTRFS_I(inode)->root->fs_info->ordered_extents);
@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        int ret;
 
        tree = &BTRFS_I(inode)->ordered_tree;
        mutex_lock(&tree->mutex);
-       clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
-                            GFP_NOFS);
        node = tree_search(tree, file_offset);
        if (!node) {
                ret = 1;
@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
                goto out;
        }
 
-       ret = test_range_bit(io_tree, entry->file_offset,
-                            entry->file_offset + entry->len - 1,
-                            EXTENT_ORDERED, 0);
-       if (ret == 0)
+       if (io_size > entry->bytes_left) {
+               printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+                      (unsigned long long)entry->bytes_left,
+                      (unsigned long long)io_size);
+       }
+       entry->bytes_left -= io_size;
+       if (entry->bytes_left == 0)
                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+       else
+               ret = 1;
 out:
        mutex_unlock(&tree->mutex);
        return ret == 0;
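
With EXTENT_ORDERED gone, completion is tracked by plain byte accounting on the ordered extent. A worked example with assumed sizes:

        /* a 1 MiB ordered extent completing in four 256 KiB IOs:
         *   bytes_left: 1048576 -> 786432 -> 524288 -> 262144 -> 0
         * only the call that reaches 0 sets BTRFS_ORDERED_IO_DONE, so
         * btrfs_dec_test_ordered_pending() returns nonzero exactly once
         * and finish_ordered_io runs once per extent.
         */
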
@@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
        u64 orig_end;
        u64 wait_end;
        struct btrfs_ordered_extent *ordered;
+       int found;
 
        if (start + len < start) {
                orig_end = INT_LIMIT(loff_t);
@@ -502,6 +501,7 @@ again:
                                           orig_end >> PAGE_CACHE_SHIFT);
 
        end = orig_end;
+       found = 0;
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, end);
                if (!ordered)
@@ -514,6 +514,7 @@ again:
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
+               found++;
                btrfs_start_ordered_extent(inode, ordered, 1);
                end = ordered->file_offset;
                btrfs_put_ordered_extent(ordered);
@@ -521,8 +522,8 @@ again:
                        break;
                end--;
        }
-       if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
-                          EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
+       if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+                          EXTENT_DELALLOC, 0, NULL)) {
                schedule_timeout(1);
                goto again;
        }
@@ -613,7 +614,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
         */
        if (test_range_bit(io_tree, disk_i_size,
                           ordered->file_offset + ordered->len - 1,
-                          EXTENT_DELALLOC, 0)) {
+                          EXTENT_DELALLOC, 0, NULL)) {
                goto out;
        }
        /*
@@ -664,7 +665,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
         */
        if (i_size_test > entry_end(ordered) &&
            !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
-                          EXTENT_DELALLOC, 0)) {
+                          EXTENT_DELALLOC, 0, NULL)) {
                new_i_size = min_t(u64, i_size_test, i_size_read(inode));
        }
        BTRFS_I(inode)->disk_i_size = new_i_size;
index 3d31c8827b013407d6f4b14796896a6aac8ba53b..993a7ea45c702a580c784908584408684913e233 100644 (file)
@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
        /* extent length on disk */
        u64 disk_len;
 
+       /* number of bytes that still need writing */
+       u64 bytes_left;
+
        /* flags (described above) */
        unsigned long flags;
 
index 3c0d52af4f806365bcd601208e00f25931ab29f2..79cba5fbc28ef8061e2d599110a3a525216a7bfd 100644 (file)
@@ -65,3 +65,23 @@ out:
        btrfs_free_path(path);
        return ret;
 }
+
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int ret;
+
+       key.objectid = BTRFS_ORPHAN_OBJECTID;
+       key.type = BTRFS_ORPHAN_ITEM_KEY;
+       key.offset = offset;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+       btrfs_free_path(path);
+       return ret;
+}
index c04f7f212602c4bcb71e58db938fcd0d8a9128ad..361ad323faaceb4ed3313bd1e2904ea29ea7d2f3 100644 (file)
@@ -121,6 +121,15 @@ struct inodevec {
        int nr;
 };
 
+#define MAX_EXTENTS 128
+
+struct file_extent_cluster {
+       u64 start;
+       u64 end;
+       u64 boundary[MAX_EXTENTS];
+       unsigned int nr;
+};
+
 struct reloc_control {
        /* block group to relocate */
        struct btrfs_block_group_cache *block_group;
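
The file_extent_cluster added above batches up to MAX_EXTENTS relocated extents whose new disk locations are contiguous; boundary[] remembers each member's start so relocate_file_extent_cluster() can tag the first page of every extent with EXTENT_BOUNDARY. A hedged sketch of how a caller might grow and flush a cluster (names per the struct above; the exact flush rule is an assumption):

        if (cluster->nr == MAX_EXTENTS ||
            (cluster->nr && extent_start != cluster->end + 1)) {
                ret = relocate_file_extent_cluster(inode, cluster);
                cluster->nr = 0;        /* flush, then start a new cluster */
        }
        if (cluster->nr == 0)
                cluster->start = extent_start;
        cluster->boundary[cluster->nr++] = extent_start;
        cluster->end = extent_start + extent_len - 1;
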
@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
                                struct reloc_control *rc)
 {
        if (test_range_bit(&rc->processed_blocks, bytenr,
-                          bytenr + blocksize - 1, EXTENT_DIRTY, 1))
+                          bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
                return 1;
        return 0;
 }
@@ -2529,56 +2538,94 @@ out:
 }
 
 static noinline_for_stack
-int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
+int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
+                        u64 block_start)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct extent_map *em;
+       int ret = 0;
+
+       em = alloc_extent_map(GFP_NOFS);
+       if (!em)
+               return -ENOMEM;
+
+       em->start = start;
+       em->len = end + 1 - start;
+       em->block_len = em->len;
+       em->block_start = block_start;
+       em->bdev = root->fs_info->fs_devices->latest_bdev;
+       set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+       lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+       while (1) {
+               write_lock(&em_tree->lock);
+               ret = add_extent_mapping(em_tree, em);
+               write_unlock(&em_tree->lock);
+               if (ret != -EEXIST) {
+                       free_extent_map(em);
+                       break;
+               }
+               btrfs_drop_extent_cache(inode, start, end, 0);
+       }
+       unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+       return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+                                       struct file_extent_cluster *cluster)
 {
        u64 page_start;
        u64 page_end;
-       unsigned long i;
-       unsigned long first_index;
+       u64 offset = BTRFS_I(inode)->index_cnt;
+       unsigned long index;
        unsigned long last_index;
-       unsigned int total_read = 0;
-       unsigned int total_dirty = 0;
+       unsigned int dirty_page = 0;
        struct page *page;
        struct file_ra_state *ra;
-       struct btrfs_ordered_extent *ordered;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       int nr = 0;
        int ret = 0;
 
+       if (!cluster->nr)
+               return 0;
+
        ra = kzalloc(sizeof(*ra), GFP_NOFS);
        if (!ra)
                return -ENOMEM;
 
+       index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+       last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+
        mutex_lock(&inode->i_mutex);
-       first_index = start >> PAGE_CACHE_SHIFT;
-       last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
 
-       /* make sure the dirty trick played by the caller work */
-       while (1) {
-               ret = invalidate_inode_pages2_range(inode->i_mapping,
-                                                   first_index, last_index);
-               if (ret != -EBUSY)
-                       break;
-               schedule_timeout(HZ/10);
-       }
+       i_size_write(inode, cluster->end + 1 - offset);
+       ret = setup_extent_mapping(inode, cluster->start - offset,
+                                  cluster->end - offset, cluster->start);
        if (ret)
                goto out_unlock;
 
        file_ra_state_init(ra, inode->i_mapping);
 
-       for (i = first_index ; i <= last_index; i++) {
-               if (total_read % ra->ra_pages == 0) {
-                       btrfs_force_ra(inode->i_mapping, ra, NULL, i,
-                               min(last_index, ra->ra_pages + i - 1));
-               }
-               total_read++;
-again:
-               if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
-                       BUG_ON(1);
-               page = grab_cache_page(inode->i_mapping, i);
+       WARN_ON(cluster->start != cluster->boundary[0]);
+       while (index <= last_index) {
+               page = find_lock_page(inode->i_mapping, index);
                if (!page) {
-                       ret = -ENOMEM;
-                       goto out_unlock;
+                       page_cache_sync_readahead(inode->i_mapping,
+                                                 ra, NULL, index,
+                                                 last_index + 1 - index);
+                       page = grab_cache_page(inode->i_mapping, index);
+                       if (!page) {
+                               ret = -ENOMEM;
+                               goto out_unlock;
+                       }
+               }
+
+               if (PageReadahead(page)) {
+                       page_cache_async_readahead(inode->i_mapping,
+                                                  ra, NULL, page, index,
+                                                  last_index + 1 - index);
                }
+
                if (!PageUptodate(page)) {
                        btrfs_readpage(NULL, page);
                        lock_page(page);
@@ -2589,75 +2636,79 @@ again:
                                goto out_unlock;
                        }
                }
-               wait_on_page_writeback(page);
 
                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
                page_end = page_start + PAGE_CACHE_SIZE - 1;
-               lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-               ordered = btrfs_lookup_ordered_extent(inode, page_start);
-               if (ordered) {
-                       unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-                       unlock_page(page);
-                       page_cache_release(page);
-                       btrfs_start_ordered_extent(inode, ordered, 1);
-                       btrfs_put_ordered_extent(ordered);
-                       goto again;
-               }
+
+               lock_extent(&BTRFS_I(inode)->io_tree,
+                           page_start, page_end, GFP_NOFS);
+
                set_page_extent_mapped(page);
 
-               if (i == first_index)
-                       set_extent_bits(io_tree, page_start, page_end,
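+               /* the first page of each extent gets EXTENT_BOUNDARY so
+                * writeback keeps the relocated extents separate */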
+               if (nr < cluster->nr &&
+                   page_start + offset == cluster->boundary[nr]) {
+                       set_extent_bits(&BTRFS_I(inode)->io_tree,
+                                       page_start, page_end,
                                        EXTENT_BOUNDARY, GFP_NOFS);
+                       nr++;
+               }
                btrfs_set_extent_delalloc(inode, page_start, page_end);
 
                set_page_dirty(page);
-               total_dirty++;
+               dirty_page++;
 
-               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+               unlock_extent(&BTRFS_I(inode)->io_tree,
+                             page_start, page_end, GFP_NOFS);
                unlock_page(page);
                page_cache_release(page);
+
+               index++;
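+               /* throttle writeback once per extent, not once per page */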
+               if (nr < cluster->nr &&
+                   page_end + 1 + offset == cluster->boundary[nr]) {
+                       balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+                                                          dirty_page);
+                       dirty_page = 0;
+               }
+       }
+       if (dirty_page) {
+               balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+                                                  dirty_page);
        }
+       WARN_ON(nr != cluster->nr);
 out_unlock:
        mutex_unlock(&inode->i_mutex);
        kfree(ra);
-       balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
        return ret;
 }
 
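For reference, the loop above swaps per-page forced readahead for find_lock_page()
plus explicit sync/async readahead, and batches the dirty-page accounting so
writeback is throttled once per extent instead of once per call. A minimal
user-space sketch of the batching policy (struct cluster, throttle() and
dirty_pages() are illustrative stand-ins, not the kernel types):

#include <stdio.h>

#define MAX_EXTENTS 128

struct cluster {
        int nr;                              /* extents in the cluster */
        unsigned long boundary[MAX_EXTENTS]; /* first page index of each extent */
};

/* stand-in for balance_dirty_pages_ratelimited_nr() */
static void throttle(unsigned int dirtied)
{
        printf("throttle after %u pages\n", dirtied);
}

static void dirty_pages(struct cluster *c, unsigned long first, unsigned long last)
{
        unsigned int dirty = 0;
        int nr = 1;                          /* boundary[0] == first by construction */

        for (unsigned long index = first; index <= last; index++) {
                /* ...lock, read and dirty page 'index' here... */
                dirty++;
                /* about to cross into the next extent: flush the batch */
                if (nr < c->nr && index + 1 == c->boundary[nr]) {
                        throttle(dirty);
                        dirty = 0;
                        nr++;
                }
        }
        if (dirty)
                throttle(dirty);
}

int main(void)
{
        struct cluster c = { .nr = 2, .boundary = { 0, 4 } };

        dirty_pages(&c, 0, 7);  /* throttles after pages 0-3, then after 4-7 */
        return 0;
}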
 static noinline_for_stack
-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
+                        struct file_extent_cluster *cluster)
 {
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-       struct extent_map *em;
-       u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
-       u64 end = start + extent_key->offset - 1;
-
-       em = alloc_extent_map(GFP_NOFS);
-       em->start = start;
-       em->len = extent_key->offset;
-       em->block_len = extent_key->offset;
-       em->block_start = extent_key->objectid;
-       em->bdev = root->fs_info->fs_devices->latest_bdev;
-       set_bit(EXTENT_FLAG_PINNED, &em->flags);
+       int ret;
 
-       /* setup extent map to cheat btrfs_readpage */
-       lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
-       while (1) {
-               int ret;
-               spin_lock(&em_tree->lock);
-               ret = add_extent_mapping(em_tree, em);
-               spin_unlock(&em_tree->lock);
-               if (ret != -EEXIST) {
-                       free_extent_map(em);
-                       break;
-               }
-               btrfs_drop_extent_cache(inode, start, end, 0);
+       if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
+               ret = relocate_file_extent_cluster(inode, cluster);
+               if (ret)
+                       return ret;
+               cluster->nr = 0;
        }
-       unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
 
-       return relocate_inode_pages(inode, start, extent_key->offset);
+       if (!cluster->nr)
+               cluster->start = extent_key->objectid;
+       else
+               BUG_ON(cluster->nr >= MAX_EXTENTS);
+       cluster->end = extent_key->objectid + extent_key->offset - 1;
+       cluster->boundary[cluster->nr] = extent_key->objectid;
+       cluster->nr++;
+
+       if (cluster->nr >= MAX_EXTENTS) {
+               ret = relocate_file_extent_cluster(inode, cluster);
+               if (ret)
+                       return ret;
+               cluster->nr = 0;
+       }
+       return 0;
 }
 
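relocate_data_extent() above no longer fakes an extent map per extent; it only
accumulates extents into a cluster, flushing when the next extent is not
contiguous with cluster->end and again when MAX_EXTENTS boundaries are queued.
The accumulation policy reduces to this sketch (flush_cluster() stands in for
relocate_file_extent_cluster(); the struct is a minimal model, not the kernel's):

#define MAX_EXTENTS 128

struct cluster {
        unsigned long long start, end;
        unsigned long long boundary[MAX_EXTENTS];
        int nr;
};

extern int flush_cluster(struct cluster *c);

static int queue_extent(struct cluster *c, unsigned long long objectid,
                        unsigned long long len)
{
        int ret;

        /* not contiguous with the cluster: flush what we have first */
        if (c->nr > 0 && objectid != c->end + 1) {
                ret = flush_cluster(c);
                if (ret)
                        return ret;
                c->nr = 0;
        }

        if (!c->nr)
                c->start = objectid;
        c->end = objectid + len - 1;
        c->boundary[c->nr++] = objectid;

        /* cluster full: flush eagerly */
        if (c->nr >= MAX_EXTENTS) {
                ret = flush_cluster(c);
                if (ret)
                        return ret;
                c->nr = 0;
        }
        return 0;
}

The caller still has to flush a non-empty cluster once the scan finishes, which
is what the new tail of relocate_block_group() does.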
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags)
        return 0;
 }
 
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
        struct rb_root blocks = RB_ROOT;
        struct btrfs_key key;
+       struct file_extent_cluster *cluster;
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
        int ret;
        int err = 0;
 
+       cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
+       if (!cluster)
+               return -ENOMEM;
+
        path = btrfs_alloc_path();
-       if (!path)
+       if (!path) {
+               kfree(cluster);
                return -ENOMEM;
+       }
 
+       rc->extents_found = 0;
+       rc->extents_skipped = 0;
+
        rc->search_start = rc->block_group->key.objectid;
        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
                          GFP_NOFS);
@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
                }
 
                nr = trans->blocks_used;
-               btrfs_end_transaction_throttle(trans, rc->extent_root);
+               btrfs_end_transaction(trans, rc->extent_root);
                trans = NULL;
                btrfs_btree_balance_dirty(rc->extent_root, nr);
 
                if (rc->stage == MOVE_DATA_EXTENTS &&
                    (flags & BTRFS_EXTENT_FLAG_DATA)) {
                        rc->found_file_extent = 1;
-                       ret = relocate_data_extent(rc->data_inode, &key);
+                       ret = relocate_data_extent(rc->data_inode,
+                                                  &key, cluster);
                        if (ret < 0) {
                                err = ret;
                                break;
@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
                btrfs_btree_balance_dirty(rc->extent_root, nr);
        }
 
+       if (!err) {
+               ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+               if (ret < 0)
+                       err = ret;
+       }
+
+       kfree(cluster);
+
        rc->create_reloc_root = 0;
        smp_mb();
 
@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 }
 
 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root,
-                                u64 objectid, u64 size)
+                                struct btrfs_root *root, u64 objectid)
 {
        struct btrfs_path *path;
        struct btrfs_inode_item *item;
@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
        memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
        btrfs_set_inode_generation(leaf, item, 1);
-       btrfs_set_inode_size(leaf, item, size);
+       btrfs_set_inode_size(leaf, item, 0);
        btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
        btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
        btrfs_mark_buffer_dirty(leaf);
@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
        if (err)
                goto out;
 
-       err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
-       BUG_ON(err);
-
-       err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-                                      group->key.offset, 0, group->key.offset,
-                                      0, 0, 0);
+       err = __insert_orphan_inode(trans, root, objectid);
        BUG_ON(err);
 
        key.objectid = objectid;
@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
        btrfs_wait_ordered_extents(fs_info->tree_root, 0);
 
        while (1) {
-               mutex_lock(&fs_info->cleaner_mutex);
-               btrfs_clean_old_snapshots(fs_info->tree_root);
-               mutex_unlock(&fs_info->cleaner_mutex);
-
                rc->extents_found = 0;
                rc->extents_skipped = 0;
 
+               mutex_lock(&fs_info->cleaner_mutex);
+
+               btrfs_clean_old_snapshots(fs_info->tree_root);
                ret = relocate_block_group(rc);
+
+               mutex_unlock(&fs_info->cleaner_mutex);
                if (ret < 0) {
                        err = ret;
                        break;
@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
                }
        }
 
-       filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
-                                rc->block_group->key.objectid,
-                                rc->block_group->key.objectid +
-                                rc->block_group->key.offset - 1);
+       filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
+                                    rc->block_group->key.objectid,
+                                    rc->block_group->key.objectid +
+                                    rc->block_group->key.offset - 1);
 
        WARN_ON(rc->block_group->pinned > 0);
        WARN_ON(rc->block_group->reserved > 0);
@@ -3530,6 +3594,26 @@ out:
        return err;
 }
 
+static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
+{
+       struct btrfs_trans_handle *trans;
+       int ret;
+
+       trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+
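+       /* zero drop_progress and the ref count so the cleaner treats this
+        * root as dead and finishes dropping it */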
+       memset(&root->root_item.drop_progress, 0,
+               sizeof(root->root_item.drop_progress));
+       root->root_item.drop_level = 0;
+       btrfs_set_root_refs(&root->root_item, 0);
+       ret = btrfs_update_root(trans, root->fs_info->tree_root,
+                               &root->root_key, &root->root_item);
+       BUG_ON(ret);
+
+       ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
+       BUG_ON(ret);
+       return 0;
+}
+
 /*
  * recover relocation interrupted by system crash.
  *
@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
                        fs_root = read_fs_root(root->fs_info,
                                               reloc_root->root_key.offset);
                        if (IS_ERR(fs_root)) {
-                               err = PTR_ERR(fs_root);
-                               goto out;
+                               ret = PTR_ERR(fs_root);
+                               if (ret != -ENOENT) {
+                                       err = ret;
+                                       goto out;
+                               }
+                               mark_garbage_root(reloc_root);
                        }
                }
 
index 0ddc6d61c55a7135bd94c15c76644011c6e9fd4c..9351428f30e2129343e7c9b882b1fbb26e56c874 100644 (file)
@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
                goto out;
 
        BUG_ON(ret == 0);
+       if (path->slots[0] == 0) {
+               ret = 1;
+               goto out;
+       }
        l = path->nodes[0];
-       BUG_ON(path->slots[0] == 0);
        slot = path->slots[0] - 1;
        btrfs_item_key_to_cpu(l, &found_key, slot);
-       if (found_key.objectid != objectid) {
+       if (found_key.objectid != objectid ||
+           found_key.type != BTRFS_ROOT_ITEM_KEY) {
                ret = 1;
                goto out;
        }
-       read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
-                          sizeof(*item));
-       memcpy(key, &found_key, sizeof(found_key));
+       if (item)
+               read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
+                                  sizeof(*item));
+       if (key)
+               memcpy(key, &found_key, sizeof(found_key));
        ret = 0;
 out:
        btrfs_free_path(path);
@@ -249,6 +255,59 @@ err:
        return ret;
 }
 
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
+{
+       struct extent_buffer *leaf;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int err = 0;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = BTRFS_ORPHAN_OBJECTID;
+       key.type = BTRFS_ORPHAN_ITEM_KEY;
+       key.offset = 0;
+
+       while (1) {
+               ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+               if (ret < 0) {
+                       err = ret;
+                       break;
+               }
+
+               leaf = path->nodes[0];
+               if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+                       ret = btrfs_next_leaf(tree_root, path);
+                       if (ret < 0)
+                               err = ret;
+                       if (ret != 0)
+                               break;
+                       leaf = path->nodes[0];
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+               btrfs_release_path(tree_root, path);
+
+               if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
+                   key.type != BTRFS_ORPHAN_ITEM_KEY)
+                       break;
+
+               ret = btrfs_find_dead_roots(tree_root, key.offset);
+               if (ret) {
+                       err = ret;
+                       break;
+               }
+
+               key.offset++;
+       }
+
+       btrfs_free_path(path);
+       return err;
+}
+
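btrfs_find_orphan_roots() above uses the usual restartable-search idiom: search
for the current key, step to the next leaf if the slot ran off the end, copy the
found key out, release the path, and resume from offset + 1. Modeled on a sorted
array (find_ge() is an illustrative stand-in for btrfs_search_slot(); the
objectid/type values are made up):

#include <stdio.h>

struct key { unsigned long long objectid, type, offset; };

static const struct key items[] = {
        { 5, 48, 11 }, { 5, 48, 42 }, { 9, 132, 0 },
};

/* first item >= the wanted key, ordered by (objectid, offset) */
static int find_ge(const struct key *want, struct key *found)
{
        for (unsigned int i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
                if (items[i].objectid > want->objectid ||
                    (items[i].objectid == want->objectid &&
                     items[i].offset >= want->offset)) {
                        *found = items[i];
                        return 0;
                }
        }
        return 1;       /* walked off the end of the tree */
}

int main(void)
{
        struct key key = { 5, 48, 0 };
        struct key found;

        while (!find_ge(&key, &found)) {
                if (found.objectid != 5 || found.type != 48)
                        break;                  /* left the orphan range */
                printf("resume dropping dead root %llu\n", found.offset);
                key = found;
                key.offset++;                   /* restart just past this item */
        }
        return 0;
}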
 /* drop the root item for 'key' from 'root' */
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_key *key)
@@ -278,31 +337,57 @@ out:
        return ret;
 }
 
-#if 0 /* this will get used when snapshot deletion is implemented */
 int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
                       struct btrfs_root *tree_root,
-                      u64 root_id, u8 type, u64 ref_id)
+                      u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+                      const char *name, int name_len)
 {
+       struct btrfs_path *path;
+       struct btrfs_root_ref *ref;
+       struct extent_buffer *leaf;
        struct btrfs_key key;
+       unsigned long ptr;
+       int err = 0;
        int ret;
-       struct btrfs_path *path;
 
        path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
 
        key.objectid = root_id;
-       key.type = type;
+       key.type = BTRFS_ROOT_BACKREF_KEY;
        key.offset = ref_id;
-
+again:
        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
-       BUG_ON(ret);
-
-       ret = btrfs_del_item(trans, tree_root, path);
-       BUG_ON(ret);
+       BUG_ON(ret < 0);
+       if (ret == 0) {
+               leaf = path->nodes[0];
+               ref = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_root_ref);
+
+               WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
+               WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
+               ptr = (unsigned long)(ref + 1);
+               WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
+               *sequence = btrfs_root_ref_sequence(leaf, ref);
+
+               ret = btrfs_del_item(trans, tree_root, path);
+               BUG_ON(ret);
+       } else {
+               err = -ENOENT;
+       }
+
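+       /* first pass handled the backref; repeat for the mirrored forward ref */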
+       if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+               btrfs_release_path(tree_root, path);
+               key.objectid = ref_id;
+               key.type = BTRFS_ROOT_REF_KEY;
+               key.offset = root_id;
+               goto again;
+       }
 
        btrfs_free_path(path);
-       return ret;
+       return err;
 }
-#endif
 
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
                   struct btrfs_path *path,
@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
        return ret;
 }
 
-
 /*
  * add a btrfs_root_ref item.  type is either BTRFS_ROOT_REF_KEY
  * or BTRFS_ROOT_BACKREF_KEY.
@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
  */
 int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
                       struct btrfs_root *tree_root,
-                      u64 root_id, u8 type, u64 ref_id,
-                      u64 dirid, u64 sequence,
+                      u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
                       const char *name, int name_len)
 {
        struct btrfs_key key;
@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
        struct extent_buffer *leaf;
        unsigned long ptr;
 
-
        path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
 
        key.objectid = root_id;
-       key.type = type;
+       key.type = BTRFS_ROOT_BACKREF_KEY;
        key.offset = ref_id;
-
+again:
        ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
                                      sizeof(*ref) + name_len);
        BUG_ON(ret);
@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
        write_extent_buffer(leaf, name, ptr, name_len);
        btrfs_mark_buffer_dirty(leaf);
 
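+       /* insert the mirrored BTRFS_ROOT_REF_KEY item with the key swapped */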
+       if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+               btrfs_release_path(tree_root, path);
+               key.objectid = ref_id;
+               key.type = BTRFS_ROOT_REF_KEY;
+               key.offset = root_id;
+               goto again;
+       }
+
        btrfs_free_path(path);
-       return ret;
+       return 0;
 }
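btrfs_add_root_ref() and btrfs_del_root_ref() now always maintain both
directions of the link: a BTRFS_ROOT_BACKREF_KEY item keyed (root_id, ref_id)
and a BTRFS_ROOT_REF_KEY item keyed (ref_id, root_id), produced by swapping the
key and looping once through the again: label. The mirroring itself is just this
(a sketch; the enum values are placeholders for the ctree.h definitions):

struct btrfs_key {
        unsigned long long objectid;
        unsigned char type;
        unsigned long long offset;
};

enum { ROOT_BACKREF_KEY = 1, ROOT_REF_KEY = 2 };

/* both items carry the same payload; only the key is mirrored */
static void root_ref_keys(unsigned long long root_id, unsigned long long ref_id,
                          struct btrfs_key *backref, struct btrfs_key *fwd)
{
        backref->objectid = root_id;
        backref->type = ROOT_BACKREF_KEY;
        backref->offset = ref_id;

        fwd->objectid = ref_id;         /* objectid and offset swapped */
        fwd->type = ROOT_REF_KEY;
        fwd->offset = root_id;
}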
index 2db17cd66fc55d32ba19a77cee7ea54b170efa34..67035385444cf29c31a59951311af66702fa80a8 100644 (file)
@@ -676,6 +676,7 @@ static int btrfs_unfreeze(struct super_block *sb)
 }
 
 static const struct super_operations btrfs_super_ops = {
+       .drop_inode     = btrfs_drop_inode,
        .delete_inode   = btrfs_delete_inode,
        .put_super      = btrfs_put_super,
        .sync_fs        = btrfs_sync_fs,
index cdbb5022da52df1e2b33593650401021077ba280..88f866f85e7affa4b387cc3cc72fae5a472bb786 100644 (file)
@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
 {
        if (root->ref_cows && root->last_trans < trans->transid) {
                WARN_ON(root == root->fs_info->extent_root);
-               WARN_ON(root->root_item.refs == 0);
                WARN_ON(root->commit_root != root->node);
 
                radix_tree_tag_set(&root->fs_info->fs_roots_radix,
@@ -720,7 +719,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 
        key.objectid = objectid;
-       key.offset = 0;
+       /* record when the snapshot was created in key.offset */
+       key.offset = trans->transid;
        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
        old = btrfs_lock_root_node(root);
@@ -778,24 +778,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        BUG_ON(ret);
 
-       /* add the backref first */
        ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
                                 pending->root_key.objectid,
-                                BTRFS_ROOT_BACKREF_KEY,
                                 parent_root->root_key.objectid,
                                 parent_inode->i_ino, index, pending->name,
                                 namelen);
 
        BUG_ON(ret);
 
-       /* now add the forward ref */
-       ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
-                                parent_root->root_key.objectid,
-                                BTRFS_ROOT_REF_KEY,
-                                pending->root_key.objectid,
-                                parent_inode->i_ino, index, pending->name,
-                                namelen);
-
        inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
        d_instantiate(pending->dentry, inode);
 fail:
@@ -874,7 +864,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        unsigned long timeout = 1;
        struct btrfs_transaction *cur_trans;
        struct btrfs_transaction *prev_trans = NULL;
-       struct extent_io_tree *pinned_copy;
        DEFINE_WAIT(wait);
        int ret;
        int should_grow = 0;
@@ -915,13 +904,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                return 0;
        }
 
-       pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
-       if (!pinned_copy)
-               return -ENOMEM;
-
-       extent_io_tree_init(pinned_copy,
-                            root->fs_info->btree_inode->i_mapping, GFP_NOFS);
-
        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
        if (cur_trans->list.prev != &root->fs_info->trans_list) {
@@ -1019,6 +1001,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        ret = commit_cowonly_roots(trans, root);
        BUG_ON(ret);
 
+       btrfs_prepare_extent_commit(trans, root);
+
        cur_trans = root->fs_info->running_transaction;
        spin_lock(&root->fs_info->new_trans_lock);
        root->fs_info->running_transaction = NULL;
@@ -1042,8 +1026,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
               sizeof(root->fs_info->super_copy));
 
-       btrfs_copy_pinned(root, pinned_copy);
-
        trans->transaction->blocked = 0;
 
        wake_up(&root->fs_info->transaction_wait);
@@ -1059,8 +1041,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         */
        mutex_unlock(&root->fs_info->tree_log_mutex);
 
-       btrfs_finish_extent_commit(trans, root, pinned_copy);
-       kfree(pinned_copy);
+       btrfs_finish_extent_commit(trans, root);
 
        /* do the directory inserts of any pending snapshot creations */
        finish_pending_snapshots(trans, root->fs_info);
@@ -1096,8 +1077,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 
        while (!list_empty(&list)) {
                root = list_entry(list.next, struct btrfs_root, root_list);
-               list_del_init(&root->root_list);
-               btrfs_drop_snapshot(root, 0);
+               list_del(&root->root_list);
+
+               if (btrfs_header_backref_rev(root->node) <
+                   BTRFS_MIXED_BACKREF_REV)
+                       btrfs_drop_snapshot(root, 0);
+               else
+                       btrfs_drop_snapshot(root, 1);
        }
        return 0;
 }
index 30c0d45c1b5e6d7bfda1a07748ac8d86b9dd0b7d..7827841b55cbd5399606cd02820055f2ff6eded2 100644 (file)
@@ -263,8 +263,8 @@ static int process_one_buffer(struct btrfs_root *log,
                              struct walk_control *wc, u64 gen)
 {
        if (wc->pin)
-               btrfs_update_pinned_extents(log->fs_info->extent_root,
-                                           eb->start, eb->len, 1);
+               btrfs_pin_extent(log->fs_info->extent_root,
+                                eb->start, eb->len, 0);
 
        if (btrfs_buffer_uptodate(eb, gen)) {
                if (wc->write)
@@ -534,7 +534,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
        saved_nbytes = inode_get_bytes(inode);
        /* drop any overlapping extents */
        ret = btrfs_drop_extents(trans, root, inode,
-                        start, extent_end, extent_end, start, &alloc_hint);
+                        start, extent_end, extent_end, start, &alloc_hint, 1);
        BUG_ON(ret);
 
        if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -2841,7 +2841,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
                if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
                        break;
 
-               if (parent == sb->s_root)
+               if (IS_ROOT(parent))
                        break;
 
                parent = parent->d_parent;
@@ -2880,6 +2880,12 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                goto end_no_trans;
        }
 
+       if (root != BTRFS_I(inode)->root ||
+           btrfs_root_refs(&root->root_item) == 0) {
+               ret = 1;
+               goto end_no_trans;
+       }
+
        ret = check_parent_dirs_for_sync(trans, inode, parent,
                                         sb, last_committed);
        if (ret)
@@ -2907,12 +2913,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                        break;
 
                inode = parent->d_inode;
+               if (root != BTRFS_I(inode)->root)
+                       break;
+
                if (BTRFS_I(inode)->generation >
                    root->fs_info->last_trans_committed) {
                        ret = btrfs_log_inode(trans, root, inode, inode_only);
                        BUG_ON(ret);
                }
-               if (parent == sb->s_root)
+               if (IS_ROOT(parent))
                        break;
 
                parent = parent->d_parent;
@@ -2951,7 +2960,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
        struct btrfs_key tmp_key;
        struct btrfs_root *log;
        struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
-       u64 highest_inode;
        struct walk_control wc = {
                .process_func = process_one_buffer,
                .stage = 0,
@@ -3010,11 +3018,6 @@ again:
                                                      path);
                        BUG_ON(ret);
                }
-               ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
-               if (ret == 0) {
-                       wc.replay_dest->highest_inode = highest_inode;
-                       wc.replay_dest->last_inode_alloc = highest_inode;
-               }
 
                key.offset = found_key.offset - 1;
                wc.replay_dest->log_root = NULL;
index 5cf405b0828d8295d1fcd514841dbe6b21afe1c4..23e7d36ff32554eb7e781ab7c8e9b8102fddc235 100644 (file)
@@ -276,7 +276,7 @@ loop_lock:
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
-               if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
+               if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;
 
@@ -719,10 +719,9 @@ error:
  * called very infrequently and that a given device has a small number
  * of extents
  */
-static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
-                                        struct btrfs_device *device,
-                                        u64 num_bytes, u64 *start,
-                                        u64 *max_avail)
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+                        struct btrfs_device *device, u64 num_bytes,
+                        u64 *start, u64 *max_avail)
 {
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
@@ -1736,6 +1735,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
        extent_root = root->fs_info->extent_root;
        em_tree = &root->fs_info->mapping_tree.map_tree;
 
+       ret = btrfs_can_relocate(extent_root, chunk_offset);
+       if (ret)
+               return -ENOSPC;
+
        /* step one, relocate all the extents inside this chunk */
        ret = btrfs_relocate_block_group(extent_root, chunk_offset);
        BUG_ON(ret);
@@ -1749,9 +1752,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
         * step two, delete the device extents and the
         * chunk tree entries
         */
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
 
        BUG_ON(em->start > chunk_offset ||
               em->start + em->len < chunk_offset);
@@ -1780,9 +1783,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
        ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
        BUG_ON(ret);
 
-       spin_lock(&em_tree->lock);
+       write_lock(&em_tree->lock);
        remove_extent_mapping(em_tree, em);
-       spin_unlock(&em_tree->lock);
+       write_unlock(&em_tree->lock);
 
        kfree(map);
        em->bdev = NULL;
@@ -1807,12 +1810,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
        struct btrfs_key found_key;
        u64 chunk_tree = chunk_root->root_key.objectid;
        u64 chunk_type;
+       bool retried = false;
+       int failed = 0;
        int ret;
 
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 
+again:
        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -1842,7 +1848,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
                        ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
                                                   found_key.objectid,
                                                   found_key.offset);
-                       BUG_ON(ret);
+                       if (ret == -ENOSPC)
+                               failed++;
+                       else if (ret)
+                               BUG();
                }
 
                if (found_key.offset == 0)
@@ -1850,6 +1859,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
                key.offset = found_key.offset - 1;
        }
        ret = 0;
+       if (failed && !retried) {
+               failed = 0;
+               retried = true;
+               goto again;
+       } else if (failed && retried) {
+               WARN_ON(1);
+               ret = -ENOSPC;
+       }
 error:
        btrfs_free_path(path);
        return ret;
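This hunk and the btrfs_shrink_device() changes below share a two-pass policy:
count -ENOSPC failures during the first sweep, retry the whole sweep once, since
earlier relocations may have freed space, and give up only if the retry still
fails. A generic skeleton of the idiom (relocate_one() is hypothetical):

#include <errno.h>
#include <stdbool.h>

extern int relocate_one(int idx);       /* may transiently return -ENOSPC */

static int sweep_with_retry(int nr_items)
{
        bool retried = false;
        int failed = 0;
        int ret;

again:
        for (int i = 0; i < nr_items; i++) {
                ret = relocate_one(i);
                if (ret == -ENOSPC)
                        failed++;       /* remember it, keep sweeping */
                else if (ret)
                        return ret;     /* hard error: stop */
        }
        if (failed && !retried) {
                failed = 0;             /* space may have been freed: retry once */
                retried = true;
                goto again;
        }
        return failed ? -ENOSPC : 0;
}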
@@ -1894,6 +1911,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
                        continue;
 
                ret = btrfs_shrink_device(device, old_size - size_to_free);
+               if (ret == -ENOSPC)
+                       break;
                BUG_ON(ret);
 
                trans = btrfs_start_transaction(dev_root, 1);
@@ -1938,9 +1957,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
                chunk = btrfs_item_ptr(path->nodes[0],
                                       path->slots[0],
                                       struct btrfs_chunk);
-               key.offset = found_key.offset;
                /* chunk zero is special */
-               if (key.offset == 0)
+               if (found_key.offset == 0)
                        break;
 
                btrfs_release_path(chunk_root, path);
@@ -1948,7 +1966,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
                                           chunk_root->root_key.objectid,
                                           found_key.objectid,
                                           found_key.offset);
-               BUG_ON(ret);
+               BUG_ON(ret && ret != -ENOSPC);
+               key.offset = found_key.offset - 1;
        }
        ret = 0;
 error:
@@ -1974,10 +1993,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
        u64 chunk_offset;
        int ret;
        int slot;
+       int failed = 0;
+       bool retried = false;
        struct extent_buffer *l;
        struct btrfs_key key;
        struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
        u64 old_total = btrfs_super_total_bytes(super_copy);
+       u64 old_size = device->total_bytes;
        u64 diff = device->total_bytes - new_size;
 
        if (new_size >= device->total_bytes)
@@ -1987,12 +2009,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
        if (!path)
                return -ENOMEM;
 
-       trans = btrfs_start_transaction(root, 1);
-       if (!trans) {
-               ret = -ENOMEM;
-               goto done;
-       }
-
        path->reada = 2;
 
        lock_chunks(root);
@@ -2001,8 +2017,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
        if (device->writeable)
                device->fs_devices->total_rw_bytes -= diff;
        unlock_chunks(root);
-       btrfs_end_transaction(trans, root);
 
+again:
        key.objectid = device->devid;
        key.offset = (u64)-1;
        key.type = BTRFS_DEV_EXTENT_KEY;
@@ -2017,6 +2033,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
                        goto done;
                if (ret) {
                        ret = 0;
+                       btrfs_release_path(root, path);
                        break;
                }
 
@@ -2024,14 +2041,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
                slot = path->slots[0];
                btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 
-               if (key.objectid != device->devid)
+               if (key.objectid != device->devid) {
+                       btrfs_release_path(root, path);
                        break;
+               }
 
                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                length = btrfs_dev_extent_length(l, dev_extent);
 
-               if (key.offset + length <= new_size)
+               if (key.offset + length <= new_size) {
+                       btrfs_release_path(root, path);
                        break;
+               }
 
                chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
                chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2040,8 +2061,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 
                ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
                                           chunk_offset);
-               if (ret)
+               if (ret && ret != -ENOSPC)
                        goto done;
+               if (ret == -ENOSPC)
+                       failed++;
+               key.offset -= 1;
+       }
+
+       if (failed && !retried) {
+               failed = 0;
+               retried = true;
+               goto again;
+       } else if (failed && retried) {
+               ret = -ENOSPC;
+               lock_chunks(root);
+
+               device->total_bytes = old_size;
+               if (device->writeable)
+                       device->fs_devices->total_rw_bytes += diff;
+               unlock_chunks(root);
+               goto done;
        }
 
        /* Shrinking succeeded, else we would be at "done". */
@@ -2294,9 +2333,9 @@ again:
        em->block_len = em->len;
 
        em_tree = &extent_root->fs_info->mapping_tree.map_tree;
-       spin_lock(&em_tree->lock);
+       write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em);
-       spin_unlock(&em_tree->lock);
+       write_unlock(&em_tree->lock);
        BUG_ON(ret);
        free_extent_map(em);
 
@@ -2491,9 +2530,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
        int readonly = 0;
        int i;
 
-       spin_lock(&map_tree->map_tree.lock);
+       read_lock(&map_tree->map_tree.lock);
        em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
-       spin_unlock(&map_tree->map_tree.lock);
+       read_unlock(&map_tree->map_tree.lock);
        if (!em)
                return 1;
 
@@ -2518,11 +2557,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
        struct extent_map *em;
 
        while (1) {
-               spin_lock(&tree->map_tree.lock);
+               write_lock(&tree->map_tree.lock);
                em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
                if (em)
                        remove_extent_mapping(&tree->map_tree, em);
-               spin_unlock(&tree->map_tree.lock);
+               write_unlock(&tree->map_tree.lock);
                if (!em)
                        break;
                kfree(em->bdev);
@@ -2540,9 +2579,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
        struct extent_map_tree *em_tree = &map_tree->map_tree;
        int ret;
 
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, logical, len);
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
        BUG_ON(!em);
 
        BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -2604,9 +2643,9 @@ again:
                atomic_set(&multi->error, 0);
        }
 
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, logical, *length);
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
 
        if (!em && unplug_page)
                return 0;
@@ -2763,9 +2802,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
        u64 stripe_nr;
        int i, j, nr = 0;
 
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, chunk_start, 1);
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
 
        BUG_ON(!em || em->start != chunk_start);
        map = (struct map_lookup *)em->bdev;
@@ -3053,9 +3092,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
        logical = key->offset;
        length = btrfs_chunk_length(leaf, chunk);
 
-       spin_lock(&map_tree->map_tree.lock);
+       read_lock(&map_tree->map_tree.lock);
        em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
-       spin_unlock(&map_tree->map_tree.lock);
+       read_unlock(&map_tree->map_tree.lock);
 
        /* already mapped? */
        if (em && em->start <= logical && em->start + em->len > logical) {
@@ -3114,9 +3153,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                map->stripes[i].dev->in_fs_metadata = 1;
        }
 
-       spin_lock(&map_tree->map_tree.lock);
+       write_lock(&map_tree->map_tree.lock);
        ret = add_extent_mapping(&map_tree->map_tree, em);
-       spin_unlock(&map_tree->map_tree.lock);
+       write_unlock(&map_tree->map_tree.lock);
        BUG_ON(ret);
        free_extent_map(em);
 
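Every extent_map_tree access in this file switches from spin_lock() to a rwlock:
lookups take read_lock() and may run concurrently, while add and remove take
write_lock(). A user-space analogue with POSIX rwlocks (the tree itself is
elided; this only illustrates the locking split):

#include <pthread.h>
#include <stddef.h>

struct em_tree {
        pthread_rwlock_t lock;          /* was a spinlock; readers now share */
        /* ...rb-tree of cached extent mappings elided... */
};

static void *em_lookup(struct em_tree *tree)
{
        void *em;

        pthread_rwlock_rdlock(&tree->lock);     /* ~ read_lock() */
        em = NULL;      /* ...lookup_extent_mapping() would run here... */
        pthread_rwlock_unlock(&tree->lock);
        return em;
}

static void em_modify(struct em_tree *tree)
{
        pthread_rwlock_wrlock(&tree->lock);     /* ~ write_lock() */
        /* ...add_extent_mapping() / remove_extent_mapping() here... */
        pthread_rwlock_unlock(&tree->lock);
}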
index 5139a833f721353a32f13ed1c52c3647c0951776..31b0fabdd2ea7da5489a4bffaa065c59e9282119 100644 (file)
@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root);
 void btrfs_unlock_volumes(void);
 void btrfs_lock_volumes(void);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+                        struct btrfs_device *device, u64 num_bytes,
+                        u64 *start, u64 *max_avail);
 #endif