Merge tag 'for-6.6-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave...
[sfrench/cifs-2.6.git] / fs / btrfs / delayed-inode.c
index 53c1211dd60bab23b4574820a1b3a70f2b2c8c15..caf0bbd028d11070ea8d87524c9e8c670e7d905d 100644 (file)
@@ -412,6 +412,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
 
 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
 {
+       struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node;
        struct rb_root_cached *root;
        struct btrfs_delayed_root *delayed_root;
 
@@ -419,18 +420,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
        if (RB_EMPTY_NODE(&delayed_item->rb_node))
                return;
 
-       delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
+       /* If it's in a rbtree, then we need to have delayed node locked. */
+       lockdep_assert_held(&delayed_node->mutex);
+
+       delayed_root = delayed_node->root->fs_info->delayed_root;
 
        BUG_ON(!delayed_root);
 
        if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
-               root = &delayed_item->delayed_node->ins_root;
+               root = &delayed_node->ins_root;
        else
-               root = &delayed_item->delayed_node->del_root;
+               root = &delayed_node->del_root;
 
        rb_erase_cached(&delayed_item->rb_node, root);
        RB_CLEAR_NODE(&delayed_item->rb_node);
-       delayed_item->delayed_node->count--;
+       delayed_node->count--;
 
        finish_one_item(delayed_root);
 }
@@ -1153,20 +1157,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
                ret = __btrfs_commit_inode_delayed_items(trans, path,
                                                         curr_node);
                if (ret) {
-                       btrfs_release_delayed_node(curr_node);
-                       curr_node = NULL;
                        btrfs_abort_transaction(trans, ret);
                        break;
                }
 
                prev_node = curr_node;
                curr_node = btrfs_next_delayed_node(curr_node);
+               /*
+                * See the comment below about releasing path before releasing
+                * node. If the commit of delayed items was successful the path
+                * should always be released, but in case of an error, it may
+                * point to locked extent buffers (a leaf at the very least).
+                */
+               ASSERT(path->nodes[0] == NULL);
                btrfs_release_delayed_node(prev_node);
        }
 
+       /*
+        * Release the path to avoid a potential deadlock and lockdep splat when
+        * releasing the delayed node, as that requires taking the delayed node's
+        * mutex. If another task starts running delayed items before we take
+        * the mutex, it will first lock the mutex and then it may try to lock
+        * the same btree path (leaf).
+        */
+       btrfs_free_path(path);
+
        if (curr_node)
                btrfs_release_delayed_node(curr_node);
-       btrfs_free_path(path);
        trans->block_rsv = block_rsv;
 
        return ret;
@@ -1413,7 +1430,29 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
        btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
 }
 
-/* Will return 0 or -ENOMEM */
+static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans)
+{
+       struct btrfs_fs_info *fs_info = trans->fs_info;
+       const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+
+       if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+               return;
+
+       /*
+        * Return 1 unit of metadata that was reserved when the transaction
+        * was started but is not needed for this dir index item: either the
+        * item fits in a leaf already accounted for, or adding it failed.
+        * Log replay joins a transaction rather than starting one, so that
+        * case is excluded by the early return above.
+        */
+       trace_btrfs_space_reservation(fs_info, "transaction",
+                                     trans->transid, bytes, 0);
+       btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
+       ASSERT(trans->bytes_reserved >= bytes);
+       trans->bytes_reserved -= bytes;
+}
+
+/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */
 int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
                                   const char *name, int name_len,
                                   struct btrfs_inode *dir,
@@ -1455,6 +1494,27 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 
        mutex_lock(&delayed_node->mutex);
 
+       /*
+        * First attempt to insert the delayed item. This is to make the error
+        * handling path simpler in case we fail (-EEXIST). There's no risk of
+        * any other task coming in and running the delayed item before we do
+        * the metadata space reservation below, because we are holding the
+        * delayed node's mutex and that mutex must also be locked before the
+        * node's delayed items can be run.
+        */
+       ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
+       if (unlikely(ret)) {
+               btrfs_err(trans->fs_info,
+"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d",
+                         name_len, name, index, btrfs_root_id(delayed_node->root),
+                         delayed_node->inode_id, dir->index_cnt,
+                         delayed_node->index_cnt, ret);
+               btrfs_release_delayed_item(delayed_item);
+               btrfs_release_dir_index_item_space(trans);
+               mutex_unlock(&delayed_node->mutex);
+               goto release_node;
+       }
+
        if (delayed_node->index_item_leaves == 0 ||
            delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
                delayed_node->curr_index_batch_size = data_len;
@@ -1472,36 +1532,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
                 * impossible.
                 */
                if (WARN_ON(ret)) {
-                       mutex_unlock(&delayed_node->mutex);
                        btrfs_release_delayed_item(delayed_item);
+                       mutex_unlock(&delayed_node->mutex);
                        goto release_node;
                }
 
                delayed_node->index_item_leaves++;
-       } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
-               const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
-
-               /*
-                * Adding the new dir index item does not require touching another
-                * leaf, so we can release 1 unit of metadata that was previously
-                * reserved when starting the transaction. This applies only to
-                * the case where we had a transaction start and excludes the
-                * transaction join case (when replaying log trees).
-                */
-               trace_btrfs_space_reservation(fs_info, "transaction",
-                                             trans->transid, bytes, 0);
-               btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
-               ASSERT(trans->bytes_reserved >= bytes);
-               trans->bytes_reserved -= bytes;
-       }
-
-       ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
-       if (unlikely(ret)) {
-               btrfs_err(trans->fs_info,
-                         "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
-                         name_len, name, delayed_node->root->root_key.objectid,
-                         delayed_node->inode_id, ret);
-               BUG();
+       } else {
+               btrfs_release_dir_index_item_space(trans);
        }
        mutex_unlock(&delayed_node->mutex);