Btrfs: fix fsync after succession of renames and unlink/rmdir
[sfrench/cifs-2.6.git] / fs / btrfs / tree-log.c
index ac232b3d6d7e2f3ba34d991ef8b2a17f96588f13..e76345f52beb52b822491ebbef233103fd65afef 100644 (file)
@@ -27,6 +27,7 @@
 #define LOG_INODE_ALL 0
 #define LOG_INODE_EXISTS 1
 #define LOG_OTHER_INODE 2
+#define LOG_OTHER_INODE_ALL 3
 
 /*
  * directory trouble cases
@@ -1330,6 +1331,67 @@ out:
        return ret;
 }
 
+static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                   struct inode *dir, struct inode *inode, const char *name,
+                   int namelen, u64 ref_index)
+{
+       struct btrfs_dir_item *dir_item;
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       struct inode *other_inode = NULL;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       dir_item = btrfs_lookup_dir_item(NULL, root, path,
+                                        btrfs_ino(BTRFS_I(dir)),
+                                        name, namelen, 0);
+       if (!dir_item) {
+               btrfs_release_path(path);
+               goto add_link;
+       } else if (IS_ERR(dir_item)) {
+               ret = PTR_ERR(dir_item);
+               goto out;
+       }
+
+       /*
+        * Our inode's dentry collides with the dentry of another inode which is
+        * in the log but not yet processed since it has a higher inode number.
+        * So delete that other dentry.
+        */
+       btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
+       btrfs_release_path(path);
+       other_inode = read_one_inode(root, key.objectid);
+       if (!other_inode) {
+               ret = -ENOENT;
+               goto out;
+       }
+       ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+                                name, namelen);
+       if (ret)
+               goto out;
+       /*
+        * If we dropped the link count to 0, bump it so that later the iput()
+        * on the inode will not free it. We will fixup the link count later.
+        */
+       if (other_inode->i_nlink == 0)
+               inc_nlink(other_inode);
+
+       ret = btrfs_run_delayed_items(trans);
+       if (ret)
+               goto out;
+add_link:
+       ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+                            name, namelen, 0, ref_index);
+out:
+       iput(other_inode);
+       btrfs_free_path(path);
+
+       return ret;
+}
+
 /*
  * replay one inode back reference item found in the log tree.
  * eb, slot and key refer to the buffer and key found in the log tree.
@@ -1466,9 +1528,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                                goto out;
 
                        /* insert our name */
-                       ret = btrfs_add_link(trans, BTRFS_I(dir),
-                                       BTRFS_I(inode),
-                                       name, namelen, 0, ref_index);
+                       ret = add_link(trans, root, dir, inode, name, namelen,
+                                      ref_index);
                        if (ret)
                                goto out;
 
@@ -2663,7 +2724,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
                                if (trans) {
                                        btrfs_tree_lock(next);
-                                       btrfs_set_lock_blocking(next);
+                                       btrfs_set_lock_blocking_write(next);
                                        clean_tree_block(fs_info, next);
                                        btrfs_wait_tree_block_writeback(next);
                                        btrfs_tree_unlock(next);
@@ -2747,7 +2808,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 
                                if (trans) {
                                        btrfs_tree_lock(next);
-                                       btrfs_set_lock_blocking(next);
+                                       btrfs_set_lock_blocking_write(next);
                                        clean_tree_block(fs_info, next);
                                        btrfs_wait_tree_block_writeback(next);
                                        btrfs_tree_unlock(next);
@@ -2829,7 +2890,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 
                        if (trans) {
                                btrfs_tree_lock(next);
-                               btrfs_set_lock_blocking(next);
+                               btrfs_set_lock_blocking_write(next);
                                clean_tree_block(fs_info, next);
                                btrfs_wait_tree_block_writeback(next);
                                btrfs_tree_unlock(next);
@@ -4717,7 +4778,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                                         const int slot,
                                         const struct btrfs_key *key,
                                         struct btrfs_inode *inode,
-                                        u64 *other_ino)
+                                        u64 *other_ino, u64 *other_parent)
 {
        int ret;
        struct btrfs_path *search_path;
@@ -4780,8 +4841,13 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                        btrfs_dir_item_key_to_cpu(search_path->nodes[0],
                                                  di, &di_key);
                        if (di_key.type == BTRFS_INODE_ITEM_KEY) {
-                               ret = 1;
-                               *other_ino = di_key.objectid;
+                               if (di_key.objectid != key->objectid) {
+                                       ret = 1;
+                                       *other_ino = di_key.objectid;
+                                       *other_parent = parent;
+                               } else {
+                                       ret = 0;
+                               }
                        } else {
                                ret = -EAGAIN;
                        }
@@ -4801,6 +4867,144 @@ out:
        return ret;
 }
 
+struct btrfs_ino_list {
+       u64 ino;
+       u64 parent;
+       struct list_head list;
+};
+
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_path *path,
+                                 struct btrfs_log_ctx *ctx,
+                                 u64 ino, u64 parent)
+{
+       struct btrfs_ino_list *ino_elem;
+       LIST_HEAD(inode_list);
+       int ret = 0;
+
+       ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+       if (!ino_elem)
+               return -ENOMEM;
+       ino_elem->ino = ino;
+       ino_elem->parent = parent;
+       list_add_tail(&ino_elem->list, &inode_list);
+
+       while (!list_empty(&inode_list)) {
+               struct btrfs_fs_info *fs_info = root->fs_info;
+               struct btrfs_key key;
+               struct inode *inode;
+
+               ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
+                                           list);
+               ino = ino_elem->ino;
+               parent = ino_elem->parent;
+               list_del(&ino_elem->list);
+               kfree(ino_elem);
+               if (ret)
+                       continue;
+
+               btrfs_release_path(path);
+
+               key.objectid = ino;
+               key.type = BTRFS_INODE_ITEM_KEY;
+               key.offset = 0;
+               inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+               /*
+                * If the other inode that had a conflicting dir entry was
+                * deleted in the current transaction, we need to log its parent
+                * directory.
+                */
+               if (IS_ERR(inode)) {
+                       ret = PTR_ERR(inode);
+                       if (ret == -ENOENT) {
+                               key.objectid = parent;
+                               inode = btrfs_iget(fs_info->sb, &key, root,
+                                                  NULL);
+                               if (IS_ERR(inode)) {
+                                       ret = PTR_ERR(inode);
+                               } else {
+                                       ret = btrfs_log_inode(trans, root,
+                                                     BTRFS_I(inode),
+                                                     LOG_OTHER_INODE_ALL,
+                                                     0, LLONG_MAX, ctx);
+                                       iput(inode);
+                               }
+                       }
+                       continue;
+               }
+               /*
+                * We are safe logging the other inode without acquiring its
+                * lock as long as we log with the LOG_INODE_EXISTS mode. We
+                * are safe against concurrent renames of the other inode as
+                * well because during a rename we pin the log and update the
+                * log with the new name before we unpin it.
+                */
+               ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+                                     LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+               if (ret) {
+                       iput(inode);
+                       continue;
+               }
+
+               key.objectid = ino;
+               key.type = BTRFS_INODE_REF_KEY;
+               key.offset = 0;
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               if (ret < 0) {
+                       iput(inode);
+                       continue;
+               }
+
+               while (true) {
+                       struct extent_buffer *leaf = path->nodes[0];
+                       int slot = path->slots[0];
+                       u64 other_ino = 0;
+                       u64 other_parent = 0;
+
+                       if (slot >= btrfs_header_nritems(leaf)) {
+                               ret = btrfs_next_leaf(root, path);
+                               if (ret < 0) {
+                                       break;
+                               } else if (ret > 0) {
+                                       ret = 0;
+                                       break;
+                               }
+                               continue;
+                       }
+
+                       btrfs_item_key_to_cpu(leaf, &key, slot);
+                       if (key.objectid != ino ||
+                           (key.type != BTRFS_INODE_REF_KEY &&
+                            key.type != BTRFS_INODE_EXTREF_KEY)) {
+                               ret = 0;
+                               break;
+                       }
+
+                       ret = btrfs_check_ref_name_override(leaf, slot, &key,
+                                       BTRFS_I(inode), &other_ino,
+                                       &other_parent);
+                       if (ret < 0)
+                               break;
+                       if (ret > 0) {
+                               ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+                               if (!ino_elem) {
+                                       ret = -ENOMEM;
+                                       break;
+                               }
+                               ino_elem->ino = other_ino;
+                               ino_elem->parent = other_parent;
+                               list_add_tail(&ino_elem->list, &inode_list);
+                               ret = 0;
+                       }
+                       path->slots[0]++;
+               }
+               iput(inode);
+       }
+
+       return ret;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -4840,6 +5044,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        u64 logged_isize = 0;
        bool need_log_inode_item = true;
        bool xattrs_logged = false;
+       bool recursive_logging = false;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -4885,8 +5090,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                return ret;
        }
 
-       if (inode_only == LOG_OTHER_INODE) {
-               inode_only = LOG_INODE_EXISTS;
+       if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
+               recursive_logging = true;
+               if (inode_only == LOG_OTHER_INODE)
+                       inode_only = LOG_INODE_EXISTS;
+               else
+                       inode_only = LOG_INODE_ALL;
                mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
        } else {
                mutex_lock(&inode->log_mutex);
@@ -4981,20 +5190,19 @@ again:
 
                if ((min_key.type == BTRFS_INODE_REF_KEY ||
                     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
-                   inode->generation == trans->transid) {
+                   inode->generation == trans->transid &&
+                   !recursive_logging) {
                        u64 other_ino = 0;
+                       u64 other_parent = 0;
 
                        ret = btrfs_check_ref_name_override(path->nodes[0],
                                        path->slots[0], &min_key, inode,
-                                       &other_ino);
+                                       &other_ino, &other_parent);
                        if (ret < 0) {
                                err = ret;
                                goto out_unlock;
                        } else if (ret > 0 && ctx &&
                                   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
-                               struct btrfs_key inode_key;
-                               struct inode *other_inode;
-
                                if (ins_nr > 0) {
                                        ins_nr++;
                                } else {
@@ -5010,43 +5218,13 @@ again:
                                        goto out_unlock;
                                }
                                ins_nr = 0;
-                               btrfs_release_path(path);
-                               inode_key.objectid = other_ino;
-                               inode_key.type = BTRFS_INODE_ITEM_KEY;
-                               inode_key.offset = 0;
-                               other_inode = btrfs_iget(fs_info->sb,
-                                                        &inode_key, root,
-                                                        NULL);
-                               /*
-                                * If the other inode that had a conflicting dir
-                                * entry was deleted in the current transaction,
-                                * we don't need to do more work nor fallback to
-                                * a transaction commit.
-                                */
-                               if (other_inode == ERR_PTR(-ENOENT)) {
-                                       goto next_key;
-                               } else if (IS_ERR(other_inode)) {
-                                       err = PTR_ERR(other_inode);
-                                       goto out_unlock;
-                               }
-                               /*
-                                * We are safe logging the other inode without
-                                * acquiring its i_mutex as long as we log with
-                                * the LOG_INODE_EXISTS mode. We're safe against
-                                * concurrent renames of the other inode as well
-                                * because during a rename we pin the log and
-                                * update the log with the new name before we
-                                * unpin it.
-                                */
-                               err = btrfs_log_inode(trans, root,
-                                               BTRFS_I(other_inode),
-                                               LOG_OTHER_INODE, 0, LLONG_MAX,
-                                               ctx);
-                               iput(other_inode);
+
+                               err = log_conflicting_inodes(trans, root, path,
+                                               ctx, other_ino, other_parent);
                                if (err)
                                        goto out_unlock;
-                               else
-                                       goto next_key;
+                               btrfs_release_path(path);
+                               goto next_key;
                        }
                }