btrfs: don't use global reserve for chunk allocation
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d81035b7ea7d597e229975691fef48f8813a908a..d637f4c4bcd076138414e9ec779785d808eb8053 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3013,8 +3013,7 @@ again:
        }
 
        if (run_all) {
-               if (!list_empty(&trans->new_bgs))
-                       btrfs_create_pending_block_groups(trans);
+               btrfs_create_pending_block_groups(trans);
 
                spin_lock(&delayed_refs->lock);
                node = rb_first_cached(&delayed_refs->href_root);
@@ -4280,10 +4279,14 @@ commit_trans:
                                /*
                                 * The cleaner kthread might still be doing iput
                                 * operations. Wait for it to finish so that
-                                * more space is released.
+                                * more space is released.  We don't need to
+                                * explicitly run the delayed iputs here because
+                                * btrfs_commit_transaction() will have woken
+                                * up the cleaner.
                                 */
-                               mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
-                               mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+                               ret = btrfs_wait_on_delayed_iputs(fs_info);
+                               if (ret)
+                                       return ret;
                                goto again;
                        } else {
                                btrfs_end_transaction(trans);
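
btrfs_wait_on_delayed_iputs() is introduced by an earlier patch in this series, so its body does not appear in this diff. As a minimal sketch, assuming fs_info carries an nr_delayed_iputs counter and a delayed_iputs_wait waitqueue (both assumptions, not visible in this file), the helper could be as simple as:

	/*
	 * Sketch only: block until every pending delayed iput has been run.
	 * A killable wait keeps the caller from becoming unkillable if the
	 * cleaner stalls; the -EINTR is what the reservation path above
	 * propagates as ret.
	 */
	int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
	{
		int ret = wait_event_killable(fs_info->delayed_iputs_wait,
				atomic_read(&fs_info->nr_delayed_iputs) == 0);

		if (ret)
			return -EINTR;
		return 0;
	}
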
@@ -4396,21 +4399,12 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
                              struct btrfs_space_info *sinfo, int force)
 {
-       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 bytes_used = btrfs_space_info_used(sinfo, false);
        u64 thresh;
 
        if (force == CHUNK_ALLOC_FORCE)
                return 1;
 
-       /*
-        * We need to take into account the global rsv because for all intents
-        * and purposes it's used space.  Don't worry about locking the
-        * global_rsv, it doesn't change except when the transaction commits.
-        */
-       if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
-               bytes_used += calc_global_rsv_need_space(global_rsv);
-
        /*
         * in limited mode, we want to have some free space up to
         * about 1% of the FS size.
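
The limited-mode check that this comment describes sits just below the hunk and is unchanged; as a sketch reconstructed from the surrounding function (so the exact helpers and constants are assumptions), it requests an allocation whenever less than max(64M, ~1% of the filesystem size) is left unused in this space_info:

	if (force == CHUNK_ALLOC_LIMITED) {
		thresh = btrfs_super_total_bytes(fs_info->super_copy);
		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));

		/* keep ~1% of the FS (at least 64M) free before backing off */
		if (sinfo->total_bytes - bytes_used < thresh)
			return 1;
	}
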
@@ -4741,7 +4735,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        u64 delalloc_bytes;
-       u64 max_reclaim;
+       u64 async_pages;
        u64 items;
        long time_left;
        unsigned long nr_pages;
@@ -4766,25 +4760,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 
        loops = 0;
        while (delalloc_bytes && loops < 3) {
-               max_reclaim = min(delalloc_bytes, to_reclaim);
-               nr_pages = max_reclaim >> PAGE_SHIFT;
+               nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+               /*
+                * Triggers inode writeback for up to nr_pages. This will invoke
+                * the ->writepages callback and trigger delalloc filling
+                * (btrfs_run_delalloc_range()).
+                */
                btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+
                /*
-                * We need to wait for the async pages to actually start before
-                * we do anything.
+                * We need to wait for the compressed pages to start before
+                * we continue.
                 */
-               max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
-               if (!max_reclaim)
+               async_pages = atomic_read(&fs_info->async_delalloc_pages);
+               if (!async_pages)
                        goto skip_async;
 
-               if (max_reclaim <= nr_pages)
-                       max_reclaim = 0;
+               /*
+                * Calculate how many compressed pages we want to be written
+                * before we continue. That is, if there are more async pages
+                * than we require, wait_event will wait until nr_pages have
+                * been written.
+                */
+               if (async_pages <= nr_pages)
+                       async_pages = 0;
                else
-                       max_reclaim -= nr_pages;
+                       async_pages -= nr_pages;
 
                wait_event(fs_info->async_submit_wait,
                           atomic_read(&fs_info->async_delalloc_pages) <=
-                          (int)max_reclaim);
+                          (int)async_pages);
 skip_async:
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
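
A concrete, made-up example of the bookkeeping above: if nr_pages works out to 300 and async_delalloc_pages reads 1000 right after writeback is kicked, async_pages becomes 700 and the wait_event() returns once the counter has dropped by at least those 300 pages; if only 200 async pages were queued in total, async_pages is clamped to 0 and we wait for all of them to be submitted.
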
@@ -4851,10 +4856,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
        if (!bytes_needed)
                return 0;
 
-       /* See if there is enough pinned space to make this reservation */
-       if (__percpu_counter_compare(&space_info->total_bytes_pinned,
-                                  bytes_needed,
-                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
+       trans = btrfs_join_transaction(fs_info->extent_root);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+
+       /*
+        * See if there is enough pinned space to make this reservation, or if
+        * we have block groups that are going to be freed, allowing us to
+        * possibly do a chunk allocation the next loop through.
+        */
+       if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
+           __percpu_counter_compare(&space_info->total_bytes_pinned,
+                                    bytes_needed,
+                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
                goto commit;
 
        /*
@@ -4862,7 +4876,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
         * this reservation.
         */
        if (space_info != delayed_rsv->space_info)
-               return -ENOSPC;
+               goto enospc;
 
        spin_lock(&delayed_rsv->lock);
        reclaim_bytes += delayed_rsv->reserved;
@@ -4877,16 +4891,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 
        if (__percpu_counter_compare(&space_info->total_bytes_pinned,
                                   bytes_needed,
-                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
-               return -ENOSPC;
-       }
+                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
+               goto enospc;
 
 commit:
-       trans = btrfs_join_transaction(fs_info->extent_root);
-       if (IS_ERR(trans))
-               return -ENOSPC;
-
        return btrfs_commit_transaction(trans);
+enospc:
+       btrfs_end_transaction(trans);
+       return -ENOSPC;
 }
 
 /*
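
Taken with the three may_commit_transaction() hunks above, the reordering means the transaction is now joined up front, because the BTRFS_TRANS_HAVE_FREE_BGS test needs a handle on the running transaction, and every failure path ends that handle through the new enospc label. Condensed, with the percpu pinned-space comparison collapsed into a placeholder predicate for illustration, the resulting flow is:

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    enough_pinned_space)	/* placeholder for the percpu compare */
		goto commit;

	if (space_info != delayed_rsv->space_info)
		goto enospc;

	/* ... fold in the delayed rsv bytes and re-check pinned space ... */

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
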
@@ -4939,6 +4951,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                btrfs_end_transaction(trans);
                break;
        case ALLOC_CHUNK:
+       case ALLOC_CHUNK_FORCE:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
@@ -4946,7 +4959,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                }
                ret = do_chunk_alloc(trans,
                                     btrfs_metadata_alloc_profile(fs_info),
-                                    CHUNK_ALLOC_NO_FORCE);
+                                    (state == ALLOC_CHUNK) ?
+                                     CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
                btrfs_end_transaction(trans);
                if (ret > 0 || ret == -ENOSPC)
                        ret = 0;
@@ -4957,9 +4971,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                 * bunch of pinned space, so make sure we run the iputs before
                 * we do our pinned bytes check below.
                 */
-               mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
                btrfs_run_delayed_iputs(fs_info);
-               mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+               btrfs_wait_on_delayed_iputs(fs_info);
 
                ret = may_commit_transaction(fs_info, space_info);
                break;
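
ALLOC_CHUNK_FORCE is a new value in enum btrfs_flush_state; the ctree.h hunk is not part of this file, but for the flush_state++ skip further down to work it presumably sits between ALLOC_CHUNK and COMMIT_TRANS, roughly:

	/* Assumed shape of the companion ctree.h change (not shown in this diff). */
	enum btrfs_flush_state {
		FLUSH_DELAYED_ITEMS_NR	= 1,
		FLUSH_DELAYED_ITEMS	= 2,
		FLUSH_DELALLOC		= 3,
		FLUSH_DELALLOC_WAIT	= 4,
		FLUSH_DELAYED_REFS_NR	= 5,
		FLUSH_DELAYED_REFS	= 6,
		ALLOC_CHUNK		= 7,
		ALLOC_CHUNK_FORCE	= 8,	/* new: must come right before COMMIT_TRANS */
		COMMIT_TRANS		= 9,
	};

With that ordering, the first full pass through btrfs_async_reclaim_metadata_space() (commit_cycles == 0) steps from ALLOC_CHUNK straight to COMMIT_TRANS, and flush_space() is only called with ALLOC_CHUNK_FORCE on a later pass, after a commit has already been tried without making progress.
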
@@ -5091,6 +5104,19 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
                                commit_cycles--;
                }
 
+               /*
+                * We don't want to force a chunk allocation until we've tried
+                * pretty hard to reclaim space.  Think of the case where we
+                * freed up a bunch of space and so have a lot of pinned space
+                * to reclaim.  We would rather use that than possibly create an
+                * underutilized metadata chunk.  So if this is our first run
+                * through the flushing state machine, skip ALLOC_CHUNK_FORCE and
+                * commit the transaction.  If nothing has changed the next time
+                * around, we can force a chunk allocation.
+                */
+               if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+                       flush_state++;
+
                if (flush_state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
@@ -8066,6 +8092,15 @@ loop:
        return ret;
 }
 
+#define DUMP_BLOCK_RSV(fs_info, rsv_name)                              \
+do {                                                                   \
+       struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;           \
+       spin_lock(&__rsv->lock);                                        \
+       btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",      \
+                  __rsv->size, __rsv->reserved);                       \
+       spin_unlock(&__rsv->lock);                                      \
+} while (0)
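
For illustration, the first call site added below, DUMP_BLOCK_RSV(fs_info, global_block_rsv);, expands to (whitespace reflowed):

	do {
		struct btrfs_block_rsv *__rsv = &(fs_info)->global_block_rsv;

		spin_lock(&__rsv->lock);
		btrfs_info(fs_info, "global_block_rsv: size %llu reserved %llu",
			   __rsv->size, __rsv->reserved);
		spin_unlock(&__rsv->lock);
	} while (0);
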
+
 static void dump_space_info(struct btrfs_fs_info *fs_info,
                            struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups)
@@ -8085,6 +8120,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
                info->bytes_readonly);
        spin_unlock(&info->lock);
 
+       DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+
        if (!dump_block_groups)
                return;
 
@@ -8492,7 +8533,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        clean_tree_block(fs_info, buf);
        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
-       btrfs_set_lock_blocking(buf);
+       btrfs_set_lock_blocking_write(buf);
        set_extent_buffer_uptodate(buf);
 
        memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -8917,7 +8958,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                reada = 1;
        }
        btrfs_tree_lock(next);
-       btrfs_set_lock_blocking(next);
+       btrfs_set_lock_blocking_write(next);
 
        ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
                                       &wc->refs[level - 1],
@@ -8977,7 +9018,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                        return -EIO;
                }
                btrfs_tree_lock(next);
-               btrfs_set_lock_blocking(next);
+               btrfs_set_lock_blocking_write(next);
        }
 
        level--;
@@ -9089,7 +9130,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                if (!path->locks[level]) {
                        BUG_ON(level == 0);
                        btrfs_tree_lock(eb);
-                       btrfs_set_lock_blocking(eb);
+                       btrfs_set_lock_blocking_write(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
                        ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9131,7 +9172,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                if (!path->locks[level] &&
                    btrfs_header_generation(eb) == trans->transid) {
                        btrfs_tree_lock(eb);
-                       btrfs_set_lock_blocking(eb);
+                       btrfs_set_lock_blocking_write(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                }
                clean_tree_block(fs_info, eb);
@@ -9298,7 +9339,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
                level = btrfs_header_level(root->node);
                path->nodes[level] = btrfs_lock_root_node(root);
-               btrfs_set_lock_blocking(path->nodes[level]);
+               btrfs_set_lock_blocking_write(path->nodes[level]);
                path->slots[level] = 0;
                path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                memset(&wc->update_progress, 0,
@@ -9328,7 +9369,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                level = btrfs_header_level(root->node);
                while (1) {
                        btrfs_tree_lock(path->nodes[level]);
-                       btrfs_set_lock_blocking(path->nodes[level]);
+                       btrfs_set_lock_blocking_write(path->nodes[level]);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
                        ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9595,6 +9636,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
        u64 num_bytes;
+       u64 sinfo_used;
        u64 min_allocable_bytes;
        int ret = -ENOSPC;
 
@@ -9621,9 +9663,10 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 
        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
                    cache->bytes_super - btrfs_block_group_used(&cache->item);
+       sinfo_used = btrfs_space_info_used(sinfo, true);
 
-       if (btrfs_space_info_used(sinfo, true) + num_bytes +
-           min_allocable_bytes <= sinfo->total_bytes) {
+       if (sinfo_used + num_bytes + min_allocable_bytes <=
+           sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
                cache->ro++;
                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -9632,6 +9675,15 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 out:
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
+       if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
+               btrfs_info(cache->fs_info,
+                       "unable to make block group %llu ro",
+                       cache->key.objectid);
+               btrfs_info(cache->fs_info,
+                       "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
+                       sinfo_used, num_bytes, min_allocable_bytes);
+               dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+       }
        return ret;
 }
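
On a filesystem mounted with -o enospc_debug, a failed attempt to mark a block group read-only would now log something along these lines (the device name and every number below are made up for illustration), followed by the usual dump_space_info() output:

	BTRFS info (device sdb): unable to make block group 1104150528 ro
	BTRFS info (device sdb): sinfo_used=1073741824 bg_num_bytes=8388608 min_allocable=0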