Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
[sfrench/cifs-2.6.git] / fs / btrfs / tree-log.c
index 25db71be462a12dc9649430f6c9b9dd9f0051d4f..d04968374e9d8bd5b118024d53ea0a5bfa6d0bd2 100644 (file)
@@ -492,11 +492,19 @@ insert:
 
                if (btrfs_inode_generation(eb, src_item) == 0) {
                        struct extent_buffer *dst_eb = path->nodes[0];
+                       const u64 ino_size = btrfs_inode_size(eb, src_item);
 
+                       /*
+                        * For regular files an ino_size == 0 is used only when
+                        * logging that an inode exists, as part of a directory
+                        * fsync, and the inode wasn't fsynced before. In this
+                        * case don't set the size of the inode in the fs/subvol
+                        * tree, otherwise we would be throwing valid data away.
+                        */
                        if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
-                           S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+                           S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+                           ino_size != 0) {
                                struct btrfs_map_token token;
-                               u64 ino_size = btrfs_inode_size(eb, src_item);
 
                                btrfs_init_map_token(&token);
                                btrfs_set_token_inode_size(dst_eb, dst_item,
@@ -1951,6 +1959,104 @@ out:
        return ret;
 }
 
+static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct btrfs_root *log,
+                             struct btrfs_path *path,
+                             const u64 ino)
+{
+       struct btrfs_key search_key;
+       struct btrfs_path *log_path;
+       int i;
+       int nritems;
+       int ret;
+
+       log_path = btrfs_alloc_path();
+       if (!log_path)
+               return -ENOMEM;
+
+       search_key.objectid = ino;
+       search_key.type = BTRFS_XATTR_ITEM_KEY;
+       search_key.offset = 0;
+again:
+       ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+process_leaf:
+       nritems = btrfs_header_nritems(path->nodes[0]);
+       for (i = path->slots[0]; i < nritems; i++) {
+               struct btrfs_key key;
+               struct btrfs_dir_item *di;
+               struct btrfs_dir_item *log_di;
+               u32 total_size;
+               u32 cur;
+
+               btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+               if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
+                       ret = 0;
+                       goto out;
+               }
+
+               di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
+               total_size = btrfs_item_size_nr(path->nodes[0], i);
+               cur = 0;
+               while (cur < total_size) {
+                       u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
+                       u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+                       u32 this_len = sizeof(*di) + name_len + data_len;
+                       char *name;
+
+                       name = kmalloc(name_len, GFP_NOFS);
+                       if (!name) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+                       read_extent_buffer(path->nodes[0], name,
+                                          (unsigned long)(di + 1), name_len);
+
+                       log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
+                                                   name, name_len, 0);
+                       btrfs_release_path(log_path);
+                       if (!log_di) {
+                               /* Doesn't exist in log tree, so delete it. */
+                               btrfs_release_path(path);
+                               di = btrfs_lookup_xattr(trans, root, path, ino,
+                                                       name, name_len, -1);
+                               kfree(name);
+                               if (IS_ERR(di)) {
+                                       ret = PTR_ERR(di);
+                                       goto out;
+                               }
+                               ASSERT(di);
+                               ret = btrfs_delete_one_dir_name(trans, root,
+                                                               path, di);
+                               if (ret)
+                                       goto out;
+                               btrfs_release_path(path);
+                               search_key = key;
+                               goto again;
+                       }
+                       kfree(name);
+                       if (IS_ERR(log_di)) {
+                               ret = PTR_ERR(log_di);
+                               goto out;
+                       }
+                       cur += this_len;
+                       di = (struct btrfs_dir_item *)((char *)di + this_len);
+               }
+       }
+       ret = btrfs_next_leaf(root, path);
+       if (ret > 0)
+               ret = 0;
+       else if (ret == 0)
+               goto process_leaf;
+out:
+       btrfs_free_path(log_path);
+       btrfs_release_path(path);
+       return ret;
+}
+
+
 /*
  * deletion replay happens before we copy any new directory items
  * out of the log or out of backreferences from inodes.  It
@@ -2104,6 +2210,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 
                        inode_item = btrfs_item_ptr(eb, i,
                                            struct btrfs_inode_item);
+                       ret = replay_xattr_deletes(wc->trans, root, log,
+                                                  path, key.objectid);
+                       if (ret)
+                               break;
                        mode = btrfs_inode_mode(eb, inode_item);
                        if (S_ISDIR(mode)) {
                                ret = replay_dir_deletes(wc->trans,
@@ -2230,7 +2340,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                                if (trans) {
                                        btrfs_tree_lock(next);
                                        btrfs_set_lock_blocking(next);
-                                       clean_tree_block(trans, root, next);
+                                       clean_tree_block(trans, root->fs_info,
+                                                       next);
                                        btrfs_wait_tree_block_writeback(next);
                                        btrfs_tree_unlock(next);
                                }
@@ -2308,7 +2419,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
                                if (trans) {
                                        btrfs_tree_lock(next);
                                        btrfs_set_lock_blocking(next);
-                                       clean_tree_block(trans, root, next);
+                                       clean_tree_block(trans, root->fs_info,
+                                                       next);
                                        btrfs_wait_tree_block_writeback(next);
                                        btrfs_tree_unlock(next);
                                }
@@ -2384,7 +2496,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
                        if (trans) {
                                btrfs_tree_lock(next);
                                btrfs_set_lock_blocking(next);
-                               clean_tree_block(trans, log, next);
+                               clean_tree_block(trans, log->fs_info, next);
                                btrfs_wait_tree_block_writeback(next);
                                btrfs_tree_unlock(next);
                        }
@@ -3020,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct inode *inode,
                          struct btrfs_path *path,
                          struct btrfs_path *dst_path, int key_type,
+                         struct btrfs_log_ctx *ctx,
                          u64 min_offset, u64 *last_offset_ret)
 {
        struct btrfs_key min_key;
@@ -3104,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                src = path->nodes[0];
                nritems = btrfs_header_nritems(src);
                for (i = path->slots[0]; i < nritems; i++) {
+                       struct btrfs_dir_item *di;
+
                        btrfs_item_key_to_cpu(src, &min_key, i);
 
                        if (min_key.objectid != ino || min_key.type != key_type)
@@ -3114,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                                err = ret;
                                goto done;
                        }
+
+                       /*
+                        * We must make sure that when we log a directory entry,
+                        * the corresponding inode, after log replay, has a
+                        * matching link count. For example:
+                        *
+                        * touch foo
+                        * mkdir mydir
+                        * sync
+                        * ln foo mydir/bar
+                        * xfs_io -c "fsync" mydir
+                        * <crash>
+                        * <mount fs and log replay>
+                        *
+                        * Would result in a fsync log that when replayed, our
+                        * file inode would have a link count of 1, but we get
+                        * two directory entries pointing to the same inode.
+                        * After removing one of the names, it would not be
+                        * possible to remove the other name, which resulted
+                        * always in stale file handle errors, and would not
+                        * be possible to rmdir the parent directory, since
+                        * its i_size could never decrement to the value
+                        * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
+                        */
+                       di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+                       btrfs_dir_item_key_to_cpu(src, di, &tmp);
+                       if (ctx &&
+                           (btrfs_dir_transid(src, di) == trans->transid ||
+                            btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+                           tmp.type != BTRFS_ROOT_ITEM_KEY)
+                               ctx->log_new_dentries = true;
                }
                path->slots[0] = nritems;
 
@@ -3175,7 +3321,8 @@ done:
 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct inode *inode,
                          struct btrfs_path *path,
-                         struct btrfs_path *dst_path)
+                         struct btrfs_path *dst_path,
+                         struct btrfs_log_ctx *ctx)
 {
        u64 min_key;
        u64 max_key;
@@ -3187,7 +3334,7 @@ again:
        max_key = 0;
        while (1) {
                ret = log_dir_items(trans, root, inode, path,
-                                   dst_path, key_type, min_key,
+                                   dst_path, key_type, ctx, min_key,
                                    &max_key);
                if (ret)
                        return ret;
@@ -3963,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
        if (ret < 0) {
                return ret;
        } else if (ret > 0) {
-               *size_ret = i_size_read(inode);
+               *size_ret = 0;
        } else {
                struct btrfs_inode_item *item;
 
@@ -4070,10 +4217,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        if (S_ISDIR(inode->i_mode)) {
                int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
 
-               if (inode_only == LOG_INODE_EXISTS) {
-                       max_key_type = BTRFS_INODE_EXTREF_KEY;
-                       max_key.type = max_key_type;
-               }
+               if (inode_only == LOG_INODE_EXISTS)
+                       max_key_type = BTRFS_XATTR_ITEM_KEY;
                ret = drop_objectid_items(trans, log, path, ino, max_key_type);
        } else {
                if (inode_only == LOG_INODE_EXISTS) {
@@ -4098,7 +4243,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                             &BTRFS_I(inode)->runtime_flags)) {
                        if (inode_only == LOG_INODE_EXISTS) {
-                               max_key.type = BTRFS_INODE_EXTREF_KEY;
+                               max_key.type = BTRFS_XATTR_ITEM_KEY;
                                ret = drop_objectid_items(trans, log, path, ino,
                                                          max_key.type);
                        } else {
@@ -4106,20 +4251,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                                          &BTRFS_I(inode)->runtime_flags);
                                clear_bit(BTRFS_INODE_COPY_EVERYTHING,
                                          &BTRFS_I(inode)->runtime_flags);
-                               ret = btrfs_truncate_inode_items(trans, log,
-                                                                inode, 0, 0);
+                               while(1) {
+                                       ret = btrfs_truncate_inode_items(trans,
+                                                        log, inode, 0, 0);
+                                       if (ret != -EAGAIN)
+                                               break;
+                               }
                        }
-               } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
-                                   &BTRFS_I(inode)->runtime_flags) ||
+               } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+                                             &BTRFS_I(inode)->runtime_flags) ||
                           inode_only == LOG_INODE_EXISTS) {
-                       if (inode_only == LOG_INODE_ALL) {
-                               clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-                                         &BTRFS_I(inode)->runtime_flags);
+                       if (inode_only == LOG_INODE_ALL)
                                fast_search = true;
-                               max_key.type = BTRFS_XATTR_ITEM_KEY;
-                       } else {
-                               max_key.type = BTRFS_INODE_EXTREF_KEY;
-                       }
+                       max_key.type = BTRFS_XATTR_ITEM_KEY;
                        ret = drop_objectid_items(trans, log, path, ino,
                                                  max_key.type);
                } else {
@@ -4277,15 +4421,18 @@ log_extents:
        }
 
        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
-               ret = log_directory_changes(trans, root, inode, path, dst_path);
+               ret = log_directory_changes(trans, root, inode, path, dst_path,
+                                           ctx);
                if (ret) {
                        err = ret;
                        goto out_unlock;
                }
        }
 
+       spin_lock(&BTRFS_I(inode)->lock);
        BTRFS_I(inode)->logged_trans = trans->transid;
        BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
+       spin_unlock(&BTRFS_I(inode)->lock);
 out_unlock:
        if (unlikely(err))
                btrfs_put_logged_extents(&logged_list);
@@ -4372,6 +4519,181 @@ out:
        return ret;
 }
 
+struct btrfs_dir_list {
+       u64 ino;
+       struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory. See log_dir_items() for
+ * details about the why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not lock their i_mutex, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ *        CPU0                                        CPU1
+ *        ----                                        ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ *                                            lock(sb_internal#2);
+ *                                            lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not locking i_mutex of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ *    that while logging the inode new references (names) are added or removed
+ *    from the inode, leaving the logged inode item with a link count that does
+ *    not match the number of logged inode reference items. This is fine because
+ *    at log replay time we compute the real number of links and correct the
+ *    link count in the inode item (see replay_one_buffer() and
+ *    link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
+ *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ *    has a size that doesn't match the sum of the lengths of all the logged
+ *    names. This does not result in a problem because if a dir_item key is
+ *    logged but its matching dir_index key is not logged, at log replay time we
+ *    don't use it to replay the respective name (see replay_one_name()). On the
+ *    other hand if only the dir_index key ends up being logged, the respective
+ *    name is added to the fs/subvol tree with both the dir_item and dir_index
+ *    keys created (see replay_one_name()).
+ *    The directory's inode item with a wrong i_size is not a problem as well,
+ *    since we don't use it at log replay time to set the i_size in the inode
+ *    item of the fs/subvol tree (see overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct inode *start_inode,
+                               struct btrfs_log_ctx *ctx)
+{
+       struct btrfs_root *log = root->log_root;
+       struct btrfs_path *path;
+       LIST_HEAD(dir_list);
+       struct btrfs_dir_list *dir_elem;
+       int ret = 0;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+       if (!dir_elem) {
+               btrfs_free_path(path);
+               return -ENOMEM;
+       }
+       dir_elem->ino = btrfs_ino(start_inode);
+       list_add_tail(&dir_elem->list, &dir_list);
+
+       while (!list_empty(&dir_list)) {
+               struct extent_buffer *leaf;
+               struct btrfs_key min_key;
+               int nritems;
+               int i;
+
+               dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
+                                           list);
+               if (ret)
+                       goto next_dir_inode;
+
+               min_key.objectid = dir_elem->ino;
+               min_key.type = BTRFS_DIR_ITEM_KEY;
+               min_key.offset = 0;
+again:
+               btrfs_release_path(path);
+               ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+               if (ret < 0) {
+                       goto next_dir_inode;
+               } else if (ret > 0) {
+                       ret = 0;
+                       goto next_dir_inode;
+               }
+
+process_leaf:
+               leaf = path->nodes[0];
+               nritems = btrfs_header_nritems(leaf);
+               for (i = path->slots[0]; i < nritems; i++) {
+                       struct btrfs_dir_item *di;
+                       struct btrfs_key di_key;
+                       struct inode *di_inode;
+                       struct btrfs_dir_list *new_dir_elem;
+                       int log_mode = LOG_INODE_EXISTS;
+                       int type;
+
+                       btrfs_item_key_to_cpu(leaf, &min_key, i);
+                       if (min_key.objectid != dir_elem->ino ||
+                           min_key.type != BTRFS_DIR_ITEM_KEY)
+                               goto next_dir_inode;
+
+                       di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+                       type = btrfs_dir_type(leaf, di);
+                       if (btrfs_dir_transid(leaf, di) < trans->transid &&
+                           type != BTRFS_FT_DIR)
+                               continue;
+                       btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+                       if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+                               continue;
+
+                       di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+                                             root, NULL);
+                       if (IS_ERR(di_inode)) {
+                               ret = PTR_ERR(di_inode);
+                               goto next_dir_inode;
+                       }
+
+                       if (btrfs_inode_in_log(di_inode, trans->transid)) {
+                               iput(di_inode);
+                               continue;
+                       }
+
+                       ctx->log_new_dentries = false;
+                       if (type == BTRFS_FT_DIR)
+                               log_mode = LOG_INODE_ALL;
+                       btrfs_release_path(path);
+                       ret = btrfs_log_inode(trans, root, di_inode,
+                                             log_mode, 0, LLONG_MAX, ctx);
+                       iput(di_inode);
+                       if (ret)
+                               goto next_dir_inode;
+                       if (ctx->log_new_dentries) {
+                               new_dir_elem = kmalloc(sizeof(*new_dir_elem),
+                                                      GFP_NOFS);
+                               if (!new_dir_elem) {
+                                       ret = -ENOMEM;
+                                       goto next_dir_inode;
+                               }
+                               new_dir_elem->ino = di_key.objectid;
+                               list_add_tail(&new_dir_elem->list, &dir_list);
+                       }
+                       break;
+               }
+               if (i == nritems) {
+                       ret = btrfs_next_leaf(log, path);
+                       if (ret < 0) {
+                               goto next_dir_inode;
+                       } else if (ret > 0) {
+                               ret = 0;
+                               goto next_dir_inode;
+                       }
+                       goto process_leaf;
+               }
+               if (min_key.offset < (u64)-1) {
+                       min_key.offset++;
+                       goto again;
+               }
+next_dir_inode:
+               list_del(&dir_elem->list);
+               kfree(dir_elem);
+       }
+
+       btrfs_free_path(path);
+       return ret;
+}
+
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
@@ -4394,6 +4716,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        const struct dentry * const first_parent = parent;
        const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
                                 last_committed);
+       bool log_dentries = false;
+       struct inode *orig_inode = inode;
 
        sb = inode->i_sb;
 
@@ -4449,6 +4773,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                goto end_trans;
        }
 
+       if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
+               log_dentries = true;
+
        while (1) {
                if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
                        break;
@@ -4485,7 +4812,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                dput(old_parent);
                old_parent = parent;
        }
-       ret = 0;
+       if (log_dentries)
+               ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+       else
+               ret = 0;
 end_trans:
        dput(old_parent);
        if (ret < 0) {