btrfs: don't use global reserve for chunk allocation
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d81035b7ea7d597e229975691fef48f8813a908a..d637f4c4bcd076138414e9ec779785d808eb8053 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3013,8 +3013,7 @@ again:
        }
 
        if (run_all) {
-               if (!list_empty(&trans->new_bgs))
-                       btrfs_create_pending_block_groups(trans);
+               btrfs_create_pending_block_groups(trans);
 
                spin_lock(&delayed_refs->lock);
                node = rb_first_cached(&delayed_refs->href_root);
@@ -4280,10 +4279,14 @@ commit_trans:
                                /*
                                 * The cleaner kthread might still be doing iput
                                 * operations. Wait for it to finish so that
-                                * more space is released.
+                                * more space is released.  We don't need to
+                                * explicitly run the delayed iputs here because
+                                * btrfs_commit_transaction() will have woken
+                                * up the cleaner.
                                 */
-                               mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
-                               mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+                               ret = btrfs_wait_on_delayed_iputs(fs_info);
+                               if (ret)
+                                       return ret;
                                goto again;
                        } else {
                                btrfs_end_transaction(trans);
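
btrfs_wait_on_delayed_iputs() is introduced by an earlier patch in this series, so its body does not appear in this diff. As a minimal sketch, assuming fs_info carries an nr_delayed_iputs counter and a delayed_iputs_wait waitqueue (both assumptions, not visible in this file), the helper could be as simple as:

	/*
	 * Sketch only: block until every pending delayed iput has been run.
	 * A killable wait keeps the caller from becoming unkillable if the
	 * cleaner stalls; the -EINTR is what the reservation path above
	 * propagates as ret.
	 */
	int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
	{
		int ret = wait_event_killable(fs_info->delayed_iputs_wait,
				atomic_read(&fs_info->nr_delayed_iputs) == 0);

		if (ret)
			return -EINTR;
		return 0;
	}
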
@@ -4396,21 +4399,12 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
                              struct btrfs_space_info *sinfo, int force)
 {
-       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 bytes_used = btrfs_space_info_used(sinfo, false);
        u64 thresh;
 
        if (force == CHUNK_ALLOC_FORCE)
                return 1;
 
-       /*
-        * We need to take into account the global rsv because for all intents
-        * and purposes it's used space.  Don't worry about locking the
-        * global_rsv, it doesn't change except when the transaction commits.
-        */
-       if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
-               bytes_used += calc_global_rsv_need_space(global_rsv);
-
        /*
         * in limited mode, we want to have some free space up to
         * about 1% of the FS size.
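
The limited-mode check that this comment describes sits just below the hunk and is unchanged; as a sketch reconstructed from the surrounding function (so the exact helpers and constants are assumptions), it requests an allocation whenever less than max(64M, ~1% of the filesystem size) is left unused in this space_info:

	if (force == CHUNK_ALLOC_LIMITED) {
		thresh = btrfs_super_total_bytes(fs_info->super_copy);
		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));

		/* keep ~1% of the FS (at least 64M) free before backing off */
		if (sinfo->total_bytes - bytes_used < thresh)
			return 1;
	}
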
@@ -4741,7 +4735,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        u64 delalloc_bytes;
-       u64 max_reclaim;
+       u64 async_pages;
        u64 items;
        long time_left;
        unsigned long nr_pages;
@@ -4766,25 +4760,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 
        loops = 0;
        while (delalloc_bytes && loops < 3) {
-               max_reclaim = min(delalloc_bytes, to_reclaim);
-               nr_pages = max_reclaim >> PAGE_SHIFT;
+               nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+               /*
+                * Triggers inode writeback for up to nr_pages. This will invoke
+                * the ->writepages callback and trigger delalloc filling
+                * (btrfs_run_delalloc_range()).
+                */
                btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+
                /*
-                * We need to wait for the async pages to actually start before
-                * we do anything.
+                * We need to wait for the compressed pages to start before
+                * we continue.
                 */
-               max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
-               if (!max_reclaim)
+               async_pages = atomic_read(&fs_info->async_delalloc_pages);
+               if (!async_pages)
                        goto skip_async;
 
-               if (max_reclaim <= nr_pages)
-                       max_reclaim = 0;
+               /*
+                * Calculate how many compressed pages we want to be written
+                * before we continue. That is, if there are more async pages
+                * than we require, wait_event will wait until nr_pages have
+                * been written.
+                */
+               if (async_pages <= nr_pages)
+                       async_pages = 0;
                else
-                       max_reclaim -= nr_pages;
+                       async_pages -= nr_pages;
 
                wait_event(fs_info->async_submit_wait,
                           atomic_read(&fs_info->async_delalloc_pages) <=
-                          (int)max_reclaim);
+                          (int)async_pages);
 skip_async:
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
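
A concrete, made-up example of the bookkeeping above: if nr_pages works out to 300 and async_delalloc_pages reads 1000 right after writeback is kicked, async_pages becomes 700 and the wait_event() returns once the counter has dropped by at least those 300 pages; if only 200 async pages were queued in total, async_pages is clamped to 0 and we wait for all of them to be submitted.
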
@@ -4851,10 +4856,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
        if (!bytes_needed)
                return 0;
 
-       /* See if there is enough pinned space to make this reservation */
-       if (__percpu_counter_compare(&space_info->total_bytes_pinned,
-                                  bytes_needed,
-                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
+       trans = btrfs_join_transaction(fs_info->extent_root);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+
+       /*
+        * See if there is enough pinned space to make this reservation, or if
+        * we have block groups that are going to be freed, allowing us to
+        * possibly do a chunk allocation the next loop through.
+        */
+       if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
+           __percpu_counter_compare(&space_info->total_bytes_pinned,
+                                    bytes_needed,
+                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
                goto commit;
 
        /*
@@ -4862,7 +4876,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
         * this reservation.
         */
        if (space_info != delayed_rsv->space_info)
-               return -ENOSPC;
+               goto enospc;
 
        spin_lock(&delayed_rsv->lock);
        reclaim_bytes += delayed_rsv->reserved;
@@ -4877,16 +4891,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 
        if (__percpu_counter_compare(&space_info->total_bytes_pinned,
                                   bytes_needed,
-                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
-               return -ENOSPC;
-       }
+                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
+               goto enospc;
 
 commit:
-       trans = btrfs_join_transaction(fs_info->extent_root);
-       if (IS_ERR(trans))
-               return -ENOSPC;
-
        return btrfs_commit_transaction(trans);
+enospc:
+       btrfs_end_transaction(trans);
+       return -ENOSPC;
 }
 
 /*
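
Taken with the three may_commit_transaction() hunks above, the reordering means the transaction is now joined up front, because the BTRFS_TRANS_HAVE_FREE_BGS test needs a handle on the running transaction, and every failure path ends that handle through the new enospc label. Condensed, with the percpu pinned-space comparison collapsed into a placeholder predicate for illustration, the resulting flow is:

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    enough_pinned_space)	/* placeholder for the percpu compare */
		goto commit;

	if (space_info != delayed_rsv->space_info)
		goto enospc;

	/* ... fold in the delayed rsv bytes and re-check pinned space ... */

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
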
@@ -4939,6 +4951,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                btrfs_end_transaction(trans);
                break;
        case ALLOC_CHUNK:
+       case ALLOC_CHUNK_FORCE:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
@@ -4946,7 +4959,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                }
                ret = do_chunk_alloc(trans,
                                     btrfs_metadata_alloc_profile(fs_info),
-                                    CHUNK_ALLOC_NO_FORCE);
+                                    (state == ALLOC_CHUNK) ?
+                                     CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
                btrfs_end_transaction(trans);
                if (ret > 0 || ret == -ENOSPC)
                        ret = 0;
@@ -4957,9 +4971,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                 * bunch of pinned space, so make sure we run the iputs before
                 * we do our pinned bytes check below.
                 */
-               mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
                btrfs_run_delayed_iputs(fs_info);
-               mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+               btrfs_wait_on_delayed_iputs(fs_info);
 
                ret = may_commit_transaction(fs_info, space_info);
                break;
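
ALLOC_CHUNK_FORCE is a new value in enum btrfs_flush_state; the ctree.h hunk is not part of this file, but for the flush_state++ skip further down to work it presumably sits between ALLOC_CHUNK and COMMIT_TRANS, roughly:

	/* Assumed shape of the companion ctree.h change (not shown in this diff). */
	enum btrfs_flush_state {
		FLUSH_DELAYED_ITEMS_NR	= 1,
		FLUSH_DELAYED_ITEMS	= 2,
		FLUSH_DELALLOC		= 3,
		FLUSH_DELALLOC_WAIT	= 4,
		FLUSH_DELAYED_REFS_NR	= 5,
		FLUSH_DELAYED_REFS	= 6,
		ALLOC_CHUNK		= 7,
		ALLOC_CHUNK_FORCE	= 8,	/* new: must come right before COMMIT_TRANS */
		COMMIT_TRANS		= 9,
	};

With that ordering, the first full pass through btrfs_async_reclaim_metadata_space() (commit_cycles == 0) steps from ALLOC_CHUNK straight to COMMIT_TRANS, and flush_space() is only called with ALLOC_CHUNK_FORCE on a later pass, after a commit has already been tried without making progress.
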
@@ -5091,6 +5104,19 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
                                commit_cycles--;
                }
 
+               /*
+                * We don't want to force a chunk allocation until we've tried
+                * pretty hard to reclaim space.  Think of the case where we
+                * freed up a bunch of space and so have a lot of pinned space
+                * to reclaim.  We would rather use that than possibly create an
+                * underutilized metadata chunk.  So if this is our first run
+                * through the flushing state machine, skip ALLOC_CHUNK_FORCE and
+                * commit the transaction.  If nothing has changed the next time
+                * around, we can force a chunk allocation.
+                */
+               if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+                       flush_state++;
+
                if (flush_state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
@@ -8066,6 +8092,15 @@ loop:
        return ret;
 }
 
+#define DUMP_BLOCK_RSV(fs_info, rsv_name)                              \
+do {                                                                   \
+       struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;           \
+       spin_lock(&__rsv->lock);                                        \
+       btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",      \
+                  __rsv->size, __rsv->reserved);                       \
+       spin_unlock(&__rsv->lock);                                      \
+} while (0)
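
For illustration, the first call site added below, DUMP_BLOCK_RSV(fs_info, global_block_rsv);, expands to (whitespace reflowed):

	do {
		struct btrfs_block_rsv *__rsv = &(fs_info)->global_block_rsv;

		spin_lock(&__rsv->lock);
		btrfs_info(fs_info, "global_block_rsv: size %llu reserved %llu",
			   __rsv->size, __rsv->reserved);
		spin_unlock(&__rsv->lock);
	} while (0);
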
+
 static void dump_space_info(struct btrfs_fs_info *fs_info,
                            struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups)
@@ -8085,6 +8120,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
                info->bytes_readonly);
        spin_unlock(&info->lock);
 
+       DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+
        if (!dump_block_groups)
                return;
 
@@ -8492,7 +8533,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        clean_tree_block(fs_info, buf);
        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
-       btrfs_set_lock_blocking(buf);
+       btrfs_set_lock_blocking_write(buf);
        set_extent_buffer_uptodate(buf);
 
        memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -8917,7 +8958,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                reada = 1;
        }
        btrfs_tree_lock(next);
-       btrfs_set_lock_blocking(next);
+       btrfs_set_lock_blocking_write(next);
 
        ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
                                       &wc->refs[level - 1],
@@ -8977,7 +9018,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                        return -EIO;
                }
                btrfs_tree_lock(next);
-               btrfs_set_lock_blocking(next);
+               btrfs_set_lock_blocking_write(next);
        }
 
        level--;
@@ -9089,7 +9130,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                if (!path->locks[level]) {
                        BUG_ON(level == 0);
                        btrfs_tree_lock(eb);
-                       btrfs_set_lock_blocking(eb);
+                       btrfs_set_lock_blocking_write(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
                        ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9131,7 +9172,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                if (!path->locks[level] &&
                    btrfs_header_generation(eb) == trans->transid) {
                        btrfs_tree_lock(eb);
-                       btrfs_set_lock_blocking(eb);
+                       btrfs_set_lock_blocking_write(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                }
                clean_tree_block(fs_info, eb);
@@ -9298,7 +9339,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
                level = btrfs_header_level(root->node);
                path->nodes[level] = btrfs_lock_root_node(root);
-               btrfs_set_lock_blocking(path->nodes[level]);
+               btrfs_set_lock_blocking_write(path->nodes[level]);
                path->slots[level] = 0;
                path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                memset(&wc->update_progress, 0,
@@ -9328,7 +9369,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                level = btrfs_header_level(root->node);
                while (1) {
                        btrfs_tree_lock(path->nodes[level]);
-                       btrfs_set_lock_blocking(path->nodes[level]);
+                       btrfs_set_lock_blocking_write(path->nodes[level]);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
                        ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9595,6 +9636,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
        u64 num_bytes;
+       u64 sinfo_used;
        u64 min_allocable_bytes;
        int ret = -ENOSPC;
 
@@ -9621,9 +9663,10 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 
        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
                    cache->bytes_super - btrfs_block_group_used(&cache->item);
+       sinfo_used = btrfs_space_info_used(sinfo, true);
 
-       if (btrfs_space_info_used(sinfo, true) + num_bytes +
-           min_allocable_bytes <= sinfo->total_bytes) {
+       if (sinfo_used + num_bytes + min_allocable_bytes <=
+           sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
                cache->ro++;
                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -9632,6 +9675,15 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 out:
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
+       if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
+               btrfs_info(cache->fs_info,
+                       "unable to make block group %llu ro",
+                       cache->key.objectid);
+               btrfs_info(cache->fs_info,
+                       "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
+                       sinfo_used, num_bytes, min_allocable_bytes);
+               dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+       }
        return ret;
 }
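
On a filesystem mounted with -o enospc_debug, a failed attempt to mark a block group read-only would now log something along these lines (the device name and every number below are made up for illustration), followed by the usual dump_space_info() output:

	BTRFS info (device sdb): unable to make block group 1104150528 ro
	BTRFS info (device sdb): sinfo_used=1073741824 bg_num_bytes=8388608 min_allocable=0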