Merge branch 'for-linus-4.12' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 10 May 2017 15:33:17 +0000 (08:33 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 10 May 2017 15:33:17 +0000 (08:33 -0700)
Pull btrfs updates from Chris Mason:
 "This has fixes and cleanups Dave Sterba collected for the merge
  window.

  The biggest functional fixes are between btrfs raid5/6 and scrub, and
  raid5/6 and device replacement. Some of our pending qgroup fixes are
  included as well while I bash on the rest in testing.

  We also have the usual set of cleanups, including one that makes
  __btrfs_map_block() much more maintainable, and conversions from
  atomic_t to refcount_t"

* 'for-linus-4.12' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (71 commits)
  btrfs: fix the gfp_mask for the reada_zones radix tree
  Btrfs: fix reported number of inode blocks
  Btrfs: send, fix file hole not being preserved due to inline extent
  Btrfs: fix extent map leak during fallocate error path
  Btrfs: fix incorrect space accounting after failure to insert inline extent
  Btrfs: fix invalid attempt to free reserved space on failure to cow range
  btrfs: Handle delalloc error correctly to avoid ordered extent hang
  btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
  btrfs: check if the device is flush capable
  btrfs: delete unused member nobarriers
  btrfs: scrub: Fix RAID56 recovery race condition
  btrfs: scrub: Introduce full stripe lock for RAID56
  btrfs: Use ktime_get_real_ts for root ctime
  Btrfs: handle only applicable errors returned by btrfs_get_extent
  btrfs: qgroup: Fix qgroup corruption caused by inode_cache mount option
  btrfs: use q which is already obtained from bdev_get_queue
  Btrfs: switch to div64_u64 if with a u64 divisor
  Btrfs: update scrub_parity to use u64 stripe_len
  Btrfs: enable repair during read for raid56 profile
  btrfs: use clear_page where appropriate
  ...

40 files changed:
MAINTAINERS
fs/btrfs/backref.c
fs/btrfs/btrfs_inode.h
fs/btrfs/compression.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-inode.h
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/extent_map.h
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/qgroup.c
fs/btrfs/qgroup.h
fs/btrfs/raid56.c
fs/btrfs/reada.c
fs/btrfs/root-tree.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/super.c
fs/btrfs/tests/btrfs-tests.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
include/trace/events/btrfs.h
include/uapi/linux/btrfs.h

index f42daf74f5414b25d4636bd088b0863f43394fb1..5fb2e94dd3e5134366a8621befe10d47c994cd7c 100644 (file)
@@ -2926,6 +2926,8 @@ T:        git git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs.git
 S:     Maintained
 F:     Documentation/filesystems/btrfs.txt
 F:     fs/btrfs/
+F:     include/linux/btrfs*
+F:     include/uapi/linux/btrfs*
 
 BTTV VIDEO4LINUX DRIVER
 M:     Mauro Carvalho Chehab <mchehab@s-opensource.com>
index 7699e16784d313459181c746d0b8c30d468e23a7..24865da63d8fdfd2a979429a426b42fe8781e0aa 100644 (file)
 #include "delayed-ref.h"
 #include "locking.h"
 
+enum merge_mode {
+       MERGE_IDENTICAL_KEYS = 1,
+       MERGE_IDENTICAL_PARENTS,
+};
+
 /* Just an arbitrary number so we can be sure this happened */
 #define BACKREF_FOUND_SHARED 6
 
@@ -533,7 +538,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
         * slot==nritems. In that case, go to the next leaf before we continue.
         */
        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
-               if (time_seq == (u64)-1)
+               if (time_seq == SEQ_LAST)
                        ret = btrfs_next_leaf(root, path);
                else
                        ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -577,7 +582,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
                        eie = NULL;
                }
 next:
-               if (time_seq == (u64)-1)
+               if (time_seq == SEQ_LAST)
                        ret = btrfs_next_item(root, path);
                else
                        ret = btrfs_next_old_item(root, path, time_seq);
@@ -629,7 +634,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 
        if (path->search_commit_root)
                root_level = btrfs_header_level(root->commit_root);
-       else if (time_seq == (u64)-1)
+       else if (time_seq == SEQ_LAST)
                root_level = btrfs_header_level(root->node);
        else
                root_level = btrfs_old_root_level(root, time_seq);
@@ -640,7 +645,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
        }
 
        path->lowest_level = level;
-       if (time_seq == (u64)-1)
+       if (time_seq == SEQ_LAST)
                ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
                                        0, 0);
        else
@@ -809,14 +814,12 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 /*
  * merge backrefs and adjust counts accordingly
  *
- * mode = 1: merge identical keys, if key is set
- *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
- *           additionally, we could even add a key range for the blocks we
- *           looked into to merge even more (-> replace unresolved refs by those
- *           having a parent).
- * mode = 2: merge identical parents
+ *    FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in __add_prelim_ref
+ *           then we can merge more here. Additionally, we could even add a key
+ *           range for the blocks we looked into to merge even more (-> replace
+ *           unresolved refs by those having a parent).
  */
-static void __merge_refs(struct list_head *head, int mode)
+static void __merge_refs(struct list_head *head, enum merge_mode mode)
 {
        struct __prelim_ref *pos1;
 
@@ -829,7 +832,7 @@ static void __merge_refs(struct list_head *head, int mode)
 
                        if (!ref_for_same_block(ref1, ref2))
                                continue;
-                       if (mode == 1) {
+                       if (mode == MERGE_IDENTICAL_KEYS) {
                                if (!ref1->parent && ref2->parent)
                                        swap(ref1, ref2);
                        } else {
@@ -1196,7 +1199,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  *
  * NOTE: This can return values > 0
  *
- * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave
+ * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
  * much like trans == NULL case, the difference only lies in it will not
  * commit root.
  * The special case is for qgroup to search roots in commit_transaction().
@@ -1243,7 +1246,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
                path->skip_locking = 1;
        }
 
-       if (time_seq == (u64)-1)
+       if (time_seq == SEQ_LAST)
                path->skip_locking = 1;
 
        /*
@@ -1273,9 +1276,9 @@ again:
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
        if (trans && likely(trans->type != __TRANS_DUMMY) &&
-           time_seq != (u64)-1) {
+           time_seq != SEQ_LAST) {
 #else
-       if (trans && time_seq != (u64)-1) {
+       if (trans && time_seq != SEQ_LAST) {
 #endif
                /*
                 * look if there are updates for this ref queued and lock the
@@ -1286,7 +1289,7 @@ again:
                head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
                if (head) {
                        if (!mutex_trylock(&head->mutex)) {
-                               atomic_inc(&head->node.refs);
+                               refcount_inc(&head->node.refs);
                                spin_unlock(&delayed_refs->lock);
 
                                btrfs_release_path(path);
@@ -1374,7 +1377,7 @@ again:
        if (ret)
                goto out;
 
-       __merge_refs(&prefs, 1);
+       __merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
 
        ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
                                      extent_item_pos, total_refs,
@@ -1382,7 +1385,7 @@ again:
        if (ret)
                goto out;
 
-       __merge_refs(&prefs, 2);
+       __merge_refs(&prefs, MERGE_IDENTICAL_PARENTS);
 
        while (!list_empty(&prefs)) {
                ref = list_first_entry(&prefs, struct __prelim_ref, list);
index 0c6baaba0651ce10ba5e394ad82a03a917ced4e6..b8622e4d1744de68180f96036ad5ddbc3c195ca8 100644 (file)
@@ -124,6 +124,13 @@ struct btrfs_inode {
         */
        u64 delalloc_bytes;
 
+       /*
+        * Total number of bytes pending delalloc that fall within a file
+        * range that is either a hole or beyond EOF (and no prealloc extent
+        * exists in the range). This is always <= delalloc_bytes.
+        */
+       u64 new_delalloc_bytes;
+
        /*
         * total number of bytes pending defrag, used by stat to check whether
         * it needs COW.
index c7721a6aa3bb5346cbd9103b4ee3e9f4528a8c4e..10e6b282d09d6e8d31b4ac8740d0119ccad3e786 100644 (file)
@@ -44,7 +44,7 @@
 
 struct compressed_bio {
        /* number of bios pending for this compressed extent */
-       atomic_t pending_bios;
+       refcount_t pending_bios;
 
        /* the pages with the compressed data on them */
        struct page **compressed_pages;
@@ -161,7 +161,7 @@ static void end_compressed_bio_read(struct bio *bio)
        /* if there are more bios still pending for this compressed
         * extent, just exit
         */
-       if (!atomic_dec_and_test(&cb->pending_bios))
+       if (!refcount_dec_and_test(&cb->pending_bios))
                goto out;
 
        inode = cb->inode;
@@ -274,7 +274,7 @@ static void end_compressed_bio_write(struct bio *bio)
        /* if there are more bios still pending for this compressed
         * extent, just exit
         */
-       if (!atomic_dec_and_test(&cb->pending_bios))
+       if (!refcount_dec_and_test(&cb->pending_bios))
                goto out;
 
        /* ok, we're the last bio for this extent, step one is to
@@ -342,7 +342,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
        cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
        if (!cb)
                return -ENOMEM;
-       atomic_set(&cb->pending_bios, 0);
+       refcount_set(&cb->pending_bios, 0);
        cb->errors = 0;
        cb->inode = inode;
        cb->start = start;
@@ -363,7 +363,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
        bio->bi_private = cb;
        bio->bi_end_io = end_compressed_bio_write;
-       atomic_inc(&cb->pending_bios);
+       refcount_set(&cb->pending_bios, 1);
 
        /* create and submit bios for the compressed pages */
        bytes_left = compressed_len;
@@ -388,7 +388,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                         * we inc the count.  Otherwise, the cb might get
                         * freed before we're done setting it up
                         */
-                       atomic_inc(&cb->pending_bios);
+                       refcount_inc(&cb->pending_bios);
                        ret = btrfs_bio_wq_end_io(fs_info, bio,
                                                  BTRFS_WQ_ENDIO_DATA);
                        BUG_ON(ret); /* -ENOMEM */
@@ -607,7 +607,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        if (!cb)
                goto out;
 
-       atomic_set(&cb->pending_bios, 0);
+       refcount_set(&cb->pending_bios, 0);
        cb->errors = 0;
        cb->inode = inode;
        cb->mirror_num = mirror_num;
@@ -656,7 +656,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        bio_set_op_attrs (comp_bio, REQ_OP_READ, 0);
        comp_bio->bi_private = cb;
        comp_bio->bi_end_io = end_compressed_bio_read;
-       atomic_inc(&cb->pending_bios);
+       refcount_set(&cb->pending_bios, 1);
 
        for (pg_index = 0; pg_index < nr_pages; pg_index++) {
                page = cb->compressed_pages[pg_index];
@@ -685,7 +685,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                         * we inc the count.  Otherwise, the cb might get
                         * freed before we're done setting it up
                         */
-                       atomic_inc(&cb->pending_bios);
+                       refcount_inc(&cb->pending_bios);
 
                        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
                                ret = btrfs_lookup_bio_sums(inode, comp_bio,
index 1c3b6c54d5eeff8038a456f9af1f83a4afb2698f..a3a75f1de002295c425f5957de7e9aba269a7682 100644 (file)
@@ -567,7 +567,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
 static noinline int
 tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
                         struct extent_buffer *eb, int dst_slot, int src_slot,
-                        int nr_items, gfp_t flags)
+                        int nr_items)
 {
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
@@ -578,11 +578,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
        if (!tree_mod_need_log(fs_info, eb))
                return 0;
 
-       tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
+       tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;
 
-       tm = kzalloc(sizeof(*tm), flags);
+       tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
@@ -596,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
-                   MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
+                   MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
@@ -663,7 +663,7 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
 static noinline int
 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
                         struct extent_buffer *old_root,
-                        struct extent_buffer *new_root, gfp_t flags,
+                        struct extent_buffer *new_root,
                         int log_removal)
 {
        struct tree_mod_elem *tm = NULL;
@@ -678,14 +678,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
        if (log_removal && btrfs_header_level(old_root) > 0) {
                nritems = btrfs_header_nritems(old_root);
                tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
-                                 flags);
+                                 GFP_NOFS);
                if (!tm_list) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
                for (i = 0; i < nritems; i++) {
                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
-                           MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
+                           MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                        if (!tm_list[i]) {
                                ret = -ENOMEM;
                                goto free_tms;
@@ -693,7 +693,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
                }
        }
 
-       tm = kzalloc(sizeof(*tm), flags);
+       tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
@@ -873,7 +873,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 {
        int ret;
        ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
-                                      nr_items, GFP_NOFS);
+                                      nr_items);
        BUG_ON(ret < 0);
 }
 
@@ -943,7 +943,7 @@ tree_mod_log_set_root_pointer(struct btrfs_root *root,
 {
        int ret;
        ret = tree_mod_log_insert_root(root->fs_info, root->node,
-                                      new_root_node, GFP_NOFS, log_removal);
+                                      new_root_node, log_removal);
        BUG_ON(ret < 0);
 }
 
index 3e21211e99c39571968f79c95d88eb3eaba6f262..643c70d2b2e65ab96a93ff4c022756ea7e59d179 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/security.h>
 #include <linux/sizes.h>
 #include <linux/dynamic_debug.h>
+#include <linux/refcount.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -518,7 +519,7 @@ struct btrfs_caching_control {
        struct btrfs_work work;
        struct btrfs_block_group_cache *block_group;
        u64 progress;
-       atomic_t count;
+       refcount_t count;
 };
 
 /* Once caching_thread() finds this much free space, it will wake up waiters. */
@@ -538,6 +539,14 @@ struct btrfs_io_ctl {
        unsigned check_crcs:1;
 };
 
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+       struct rb_root root;
+       struct mutex lock;
+};
+
 struct btrfs_block_group_cache {
        struct btrfs_key key;
        struct btrfs_block_group_item item;
@@ -648,6 +657,9 @@ struct btrfs_block_group_cache {
         * Protected by free_space_lock.
         */
        int needs_free_space;
+
+       /* Record locked full stripes for RAID5/6 block group */
+       struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
 };
 
 /* delayed seq elem */
@@ -658,6 +670,8 @@ struct seq_list {
 
 #define SEQ_LIST_INIT(name)    { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
 
+#define SEQ_LAST       ((u64)-1)
+
 enum btrfs_orphan_cleanup_state {
        ORPHAN_CLEANUP_STARTED  = 1,
        ORPHAN_CLEANUP_DONE     = 2,
@@ -702,6 +716,11 @@ struct btrfs_delayed_root;
 #define BTRFS_FS_BTREE_ERR                     11
 #define BTRFS_FS_LOG1_ERR                      12
 #define BTRFS_FS_LOG2_ERR                      13
+/*
+ * Indicate that a whole-filesystem exclusive operation is running
+ * (device replace, resize, device add/delete, balance)
+ */
+#define BTRFS_FS_EXCL_OP                       14
 
 struct btrfs_fs_info {
        u8 fsid[BTRFS_FSID_SIZE];
@@ -1066,8 +1085,6 @@ struct btrfs_fs_info {
        /* device replace state */
        struct btrfs_dev_replace dev_replace;
 
-       atomic_t mutually_exclusive_operation_running;
-
        struct percpu_counter bio_counter;
        wait_queue_head_t replace_wait;
 
@@ -1220,7 +1237,7 @@ struct btrfs_root {
        dev_t anon_dev;
 
        spinlock_t root_item_lock;
-       atomic_t refs;
+       refcount_t refs;
 
        struct mutex delalloc_mutex;
        spinlock_t delalloc_lock;
@@ -3646,6 +3663,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
                           struct btrfs_device *dev);
 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
                         struct btrfs_scrub_progress *progress);
+static inline void btrfs_init_full_stripe_locks_tree(
+                       struct btrfs_full_stripe_locks_tree *locks_root)
+{
+       locks_root->root = RB_ROOT;
+       mutex_init(&locks_root->lock);
+}
 
 /* dev-replace.c */
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
@@ -3670,8 +3693,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
                              struct btrfs_key *start, struct btrfs_key *end);
 int btrfs_reada_wait(void *handle);
 void btrfs_reada_detach(void *handle);
-int btree_readahead_hook(struct btrfs_fs_info *fs_info,
-                        struct extent_buffer *eb, int err);
+int btree_readahead_hook(struct extent_buffer *eb, int err);
 
 static inline int is_fstree(u64 rootid)
 {
index 1aff676f0e5b5b6c63efd32eee44958b40968e2a..8ae409b5a61d7186f050780beced1bf72cd572ca 100644 (file)
@@ -52,7 +52,7 @@ static inline void btrfs_init_delayed_node(
 {
        delayed_node->root = root;
        delayed_node->inode_id = inode_id;
-       atomic_set(&delayed_node->refs, 0);
+       refcount_set(&delayed_node->refs, 0);
        delayed_node->ins_root = RB_ROOT;
        delayed_node->del_root = RB_ROOT;
        mutex_init(&delayed_node->mutex);
@@ -81,7 +81,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 
        node = READ_ONCE(btrfs_inode->delayed_node);
        if (node) {
-               atomic_inc(&node->refs);
+               refcount_inc(&node->refs);
                return node;
        }
 
@@ -89,14 +89,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
        node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
        if (node) {
                if (btrfs_inode->delayed_node) {
-                       atomic_inc(&node->refs);        /* can be accessed */
+                       refcount_inc(&node->refs);      /* can be accessed */
                        BUG_ON(btrfs_inode->delayed_node != node);
                        spin_unlock(&root->inode_lock);
                        return node;
                }
                btrfs_inode->delayed_node = node;
                /* can be accessed and cached in the inode */
-               atomic_add(2, &node->refs);
+               refcount_add(2, &node->refs);
                spin_unlock(&root->inode_lock);
                return node;
        }
@@ -125,7 +125,7 @@ again:
        btrfs_init_delayed_node(node, root, ino);
 
        /* cached in the btrfs inode and can be accessed */
-       atomic_add(2, &node->refs);
+       refcount_set(&node->refs, 2);
 
        ret = radix_tree_preload(GFP_NOFS);
        if (ret) {
@@ -166,7 +166,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
        } else {
                list_add_tail(&node->n_list, &root->node_list);
                list_add_tail(&node->p_list, &root->prepare_list);
-               atomic_inc(&node->refs);        /* inserted into list */
+               refcount_inc(&node->refs);      /* inserted into list */
                root->nodes++;
                set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
        }
@@ -180,7 +180,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
        spin_lock(&root->lock);
        if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
                root->nodes--;
-               atomic_dec(&node->refs);        /* not in the list */
+               refcount_dec(&node->refs);      /* not in the list */
                list_del_init(&node->n_list);
                if (!list_empty(&node->p_list))
                        list_del_init(&node->p_list);
@@ -201,7 +201,7 @@ static struct btrfs_delayed_node *btrfs_first_delayed_node(
 
        p = delayed_root->node_list.next;
        node = list_entry(p, struct btrfs_delayed_node, n_list);
-       atomic_inc(&node->refs);
+       refcount_inc(&node->refs);
 out:
        spin_unlock(&delayed_root->lock);
 
@@ -228,7 +228,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
                p = node->n_list.next;
 
        next = list_entry(p, struct btrfs_delayed_node, n_list);
-       atomic_inc(&next->refs);
+       refcount_inc(&next->refs);
 out:
        spin_unlock(&delayed_root->lock);
 
@@ -253,11 +253,11 @@ static void __btrfs_release_delayed_node(
                btrfs_dequeue_delayed_node(delayed_root, delayed_node);
        mutex_unlock(&delayed_node->mutex);
 
-       if (atomic_dec_and_test(&delayed_node->refs)) {
+       if (refcount_dec_and_test(&delayed_node->refs)) {
                bool free = false;
                struct btrfs_root *root = delayed_node->root;
                spin_lock(&root->inode_lock);
-               if (atomic_read(&delayed_node->refs) == 0) {
+               if (refcount_read(&delayed_node->refs) == 0) {
                        radix_tree_delete(&root->delayed_nodes_tree,
                                          delayed_node->inode_id);
                        free = true;
@@ -286,7 +286,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
        p = delayed_root->prepare_list.next;
        list_del_init(p);
        node = list_entry(p, struct btrfs_delayed_node, p_list);
-       atomic_inc(&node->refs);
+       refcount_inc(&node->refs);
 out:
        spin_unlock(&delayed_root->lock);
 
@@ -308,7 +308,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
                item->ins_or_del = 0;
                item->bytes_reserved = 0;
                item->delayed_node = NULL;
-               atomic_set(&item->refs, 1);
+               refcount_set(&item->refs, 1);
        }
        return item;
 }
@@ -483,7 +483,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
 {
        if (item) {
                __btrfs_remove_delayed_item(item);
-               if (atomic_dec_and_test(&item->refs))
+               if (refcount_dec_and_test(&item->refs))
                        kfree(item);
        }
 }
@@ -1600,14 +1600,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
        mutex_lock(&delayed_node->mutex);
        item = __btrfs_first_delayed_insertion_item(delayed_node);
        while (item) {
-               atomic_inc(&item->refs);
+               refcount_inc(&item->refs);
                list_add_tail(&item->readdir_list, ins_list);
                item = __btrfs_next_delayed_item(item);
        }
 
        item = __btrfs_first_delayed_deletion_item(delayed_node);
        while (item) {
-               atomic_inc(&item->refs);
+               refcount_inc(&item->refs);
                list_add_tail(&item->readdir_list, del_list);
                item = __btrfs_next_delayed_item(item);
        }
@@ -1621,7 +1621,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
         * insert/delete delayed items in this period. So we also needn't
         * requeue or dequeue this delayed node.
         */
-       atomic_dec(&delayed_node->refs);
+       refcount_dec(&delayed_node->refs);
 
        return true;
 }
@@ -1634,13 +1634,13 @@ void btrfs_readdir_put_delayed_items(struct inode *inode,
 
        list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
                list_del(&curr->readdir_list);
-               if (atomic_dec_and_test(&curr->refs))
+               if (refcount_dec_and_test(&curr->refs))
                        kfree(curr);
        }
 
        list_for_each_entry_safe(curr, next, del_list, readdir_list) {
                list_del(&curr->readdir_list);
-               if (atomic_dec_and_test(&curr->refs))
+               if (refcount_dec_and_test(&curr->refs))
                        kfree(curr);
        }
 
@@ -1667,7 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
                list_del(&curr->readdir_list);
                ret = (curr->key.offset == index);
 
-               if (atomic_dec_and_test(&curr->refs))
+               if (refcount_dec_and_test(&curr->refs))
                        kfree(curr);
 
                if (ret)
@@ -1705,7 +1705,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
                list_del(&curr->readdir_list);
 
                if (curr->key.offset < ctx->pos) {
-                       if (atomic_dec_and_test(&curr->refs))
+                       if (refcount_dec_and_test(&curr->refs))
                                kfree(curr);
                        continue;
                }
@@ -1722,7 +1722,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
                over = !dir_emit(ctx, name, name_len,
                               location.objectid, d_type);
 
-               if (atomic_dec_and_test(&curr->refs))
+               if (refcount_dec_and_test(&curr->refs))
                        kfree(curr);
 
                if (over)
@@ -1963,7 +1963,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
                inode_id = delayed_nodes[n - 1]->inode_id + 1;
 
                for (i = 0; i < n; i++)
-                       atomic_inc(&delayed_nodes[i]->refs);
+                       refcount_inc(&delayed_nodes[i]->refs);
                spin_unlock(&root->inode_lock);
 
                for (i = 0; i < n; i++) {
index 40327cc3b99a3bdcd517827652969299e1ce2a2b..c4189d4959343219eadf1db68d8aa5dd3f8d6ee4 100644 (file)
@@ -26,7 +26,7 @@
 #include <linux/list.h>
 #include <linux/wait.h>
 #include <linux/atomic.h>
-
+#include <linux/refcount.h>
 #include "ctree.h"
 
 /* types of the delayed item */
@@ -67,7 +67,7 @@ struct btrfs_delayed_node {
        struct rb_root del_root;
        struct mutex mutex;
        struct btrfs_inode_item inode_item;
-       atomic_t refs;
+       refcount_t refs;
        u64 index_cnt;
        unsigned long flags;
        int count;
@@ -80,7 +80,7 @@ struct btrfs_delayed_item {
        struct list_head readdir_list;  /* used for readdir items */
        u64 bytes_reserved;
        struct btrfs_delayed_node *delayed_node;
-       atomic_t refs;
+       refcount_t refs;
        int ins_or_del;
        u32 data_len;
        char data[0];
index 6eb80952efb3310ae55de9c2e234319abe14465a..be70d90dfee591953d53100b3c872d0efc5f324f 100644 (file)
@@ -164,7 +164,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
        if (mutex_trylock(&head->mutex))
                return 0;
 
-       atomic_inc(&head->node.refs);
+       refcount_inc(&head->node.refs);
        spin_unlock(&delayed_refs->lock);
 
        mutex_lock(&head->mutex);
@@ -590,7 +590,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
        delayed_refs = &trans->transaction->delayed_refs;
 
        /* first set the basic ref node struct up */
-       atomic_set(&ref->refs, 1);
+       refcount_set(&ref->refs, 1);
        ref->bytenr = bytenr;
        ref->num_bytes = num_bytes;
        ref->ref_mod = count_mod;
@@ -682,7 +682,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
        delayed_refs = &trans->transaction->delayed_refs;
 
        /* first set the basic ref node struct up */
-       atomic_set(&ref->refs, 1);
+       refcount_set(&ref->refs, 1);
        ref->bytenr = bytenr;
        ref->num_bytes = num_bytes;
        ref->ref_mod = 1;
@@ -739,7 +739,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
                seq = atomic64_read(&fs_info->tree_mod_seq);
 
        /* first set the basic ref node struct up */
-       atomic_set(&ref->refs, 1);
+       refcount_set(&ref->refs, 1);
        ref->bytenr = bytenr;
        ref->num_bytes = num_bytes;
        ref->ref_mod = 1;
index 0e537f98f1a1c63c529c118baed1cac26efa194e..c0264ff01b53cfe9e2fa44a7cdd7e9352012f0ee 100644 (file)
@@ -18,6 +18,8 @@
 #ifndef __DELAYED_REF__
 #define __DELAYED_REF__
 
+#include <linux/refcount.h>
+
 /* these are the possible values of struct btrfs_delayed_ref_node->action */
 #define BTRFS_ADD_DELAYED_REF    1 /* add one backref to the tree */
 #define BTRFS_DROP_DELAYED_REF   2 /* delete one backref from the tree */
@@ -53,7 +55,7 @@ struct btrfs_delayed_ref_node {
        u64 seq;
 
        /* ref count on this data structure */
-       atomic_t refs;
+       refcount_t refs;
 
        /*
         * how many refs is this entry adding or deleting.  For
@@ -220,8 +222,8 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 {
-       WARN_ON(atomic_read(&ref->refs) == 0);
-       if (atomic_dec_and_test(&ref->refs)) {
+       WARN_ON(refcount_read(&ref->refs) == 0);
+       if (refcount_dec_and_test(&ref->refs)) {
                WARN_ON(ref->in_tree);
                switch (ref->type) {
                case BTRFS_TREE_BLOCK_REF_KEY:
index e653921f05d93936581785553a8964124f5df1c0..5fe1ca8abc70577fe28ee6fd69899d11b618d479 100644 (file)
@@ -546,8 +546,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                mutex_unlock(&fs_info->chunk_mutex);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                mutex_unlock(&uuid_mutex);
+               btrfs_rm_dev_replace_blocked(fs_info);
                if (tgt_device)
                        btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+               btrfs_rm_dev_replace_unblocked(fs_info);
                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 
                return scrub_ret;
@@ -665,7 +667,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
                srcdev = dev_replace->srcdev;
-               args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+               args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
                        div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
                break;
        }
@@ -784,8 +786,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
        }
        btrfs_dev_replace_unlock(dev_replace, 1);
 
-       WARN_ON(atomic_xchg(
-               &fs_info->mutually_exclusive_operation_running, 1));
+       WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
        task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
        return PTR_ERR_OR_ZERO(task);
 }
@@ -814,7 +815,7 @@ static int btrfs_dev_replace_kthread(void *data)
                        (unsigned int)progress);
        }
        btrfs_dev_replace_continue_on_mount(fs_info);
-       atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 
        return 0;
 }
index 061c1d1f774f289d854ea6114a74aaf9c2a67b84..8685d67185d01bf90bcd2cf6d7cdd168e044c777 100644 (file)
@@ -762,7 +762,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 err:
        if (reads_done &&
            test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
-               btree_readahead_hook(fs_info, eb, ret);
+               btree_readahead_hook(eb, ret);
 
        if (ret) {
                /*
@@ -787,7 +787,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
        eb->read_mirror = failed_mirror;
        atomic_dec(&eb->io_pages);
        if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
-               btree_readahead_hook(eb->fs_info, eb, -EIO);
+               btree_readahead_hook(eb, -EIO);
        return -EIO;    /* we fixed nothing */
 }
 
@@ -1340,7 +1340,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        atomic_set(&root->log_writers, 0);
        atomic_set(&root->log_batch, 0);
        atomic_set(&root->orphan_inodes, 0);
-       atomic_set(&root->refs, 1);
+       refcount_set(&root->refs, 1);
        atomic_set(&root->will_be_snapshoted, 0);
        atomic64_set(&root->qgroup_meta_rsv, 0);
        root->log_transid = 0;
@@ -3497,10 +3497,11 @@ static void btrfs_end_empty_barrier(struct bio *bio)
  */
 static int write_dev_flush(struct btrfs_device *device, int wait)
 {
+       struct request_queue *q = bdev_get_queue(device->bdev);
        struct bio *bio;
        int ret = 0;
 
-       if (device->nobarriers)
+       if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
                return 0;
 
        if (wait) {
@@ -4321,7 +4322,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                head = rb_entry(node, struct btrfs_delayed_ref_head,
                                href_node);
                if (!mutex_trylock(&head->mutex)) {
-                       atomic_inc(&head->node.refs);
+                       refcount_inc(&head->node.refs);
                        spin_unlock(&delayed_refs->lock);
 
                        mutex_lock(&head->mutex);
@@ -4593,7 +4594,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
                t = list_first_entry(&fs_info->trans_list,
                                     struct btrfs_transaction, list);
                if (t->state >= TRANS_STATE_COMMIT_START) {
-                       atomic_inc(&t->use_count);
+                       refcount_inc(&t->use_count);
                        spin_unlock(&fs_info->trans_lock);
                        btrfs_wait_for_commit(fs_info, t->transid);
                        btrfs_put_transaction(t);
index 2e0ec29bfd69f04b4232b75754010594bd3d5f95..21f1ceb85b76737a67c1ffbc02cbd725b09fb510 100644 (file)
@@ -101,14 +101,14 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
  */
 static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
 {
-       if (atomic_inc_not_zero(&root->refs))
+       if (refcount_inc_not_zero(&root->refs))
                return root;
        return NULL;
 }
 
 static inline void btrfs_put_fs_root(struct btrfs_root *root)
 {
-       if (atomic_dec_and_test(&root->refs))
+       if (refcount_dec_and_test(&root->refs))
                kfree(root);
 }
 
index be5477676cc829e4efe89349fc9b7df540fd0dff..e390451c72e6cdb93492e519cea82d5d7b3dfaf9 100644 (file)
@@ -131,6 +131,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
        if (atomic_dec_and_test(&cache->count)) {
                WARN_ON(cache->pinned > 0);
                WARN_ON(cache->reserved > 0);
+
+               /*
+                * If the tree is not empty, someone is still holding a
+                * full_stripe_lock mutex, which only that caller can release.
+                * Releasing it after the cache is freed below is a
+                * guaranteed use-after-free.
+                *
+                * There is no better way to resolve this here than to warn.
+                */
+               WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
                kfree(cache->free_space_ctl);
                kfree(cache);
        }
@@ -316,14 +326,14 @@ get_caching_control(struct btrfs_block_group_cache *cache)
        }
 
        ctl = cache->caching_ctl;
-       atomic_inc(&ctl->count);
+       refcount_inc(&ctl->count);
        spin_unlock(&cache->lock);
        return ctl;
 }
 
 static void put_caching_control(struct btrfs_caching_control *ctl)
 {
-       if (atomic_dec_and_test(&ctl->count))
+       if (refcount_dec_and_test(&ctl->count))
                kfree(ctl);
 }
 
@@ -599,7 +609,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        init_waitqueue_head(&caching_ctl->wait);
        caching_ctl->block_group = cache;
        caching_ctl->progress = cache->key.objectid;
-       atomic_set(&caching_ctl->count, 1);
+       refcount_set(&caching_ctl->count, 1);
        btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
                        caching_thread, NULL, NULL);
 
@@ -620,7 +630,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                struct btrfs_caching_control *ctl;
 
                ctl = cache->caching_ctl;
-               atomic_inc(&ctl->count);
+               refcount_inc(&ctl->count);
                prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
                spin_unlock(&cache->lock);
 
@@ -707,7 +717,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        }
 
        down_write(&fs_info->commit_root_sem);
-       atomic_inc(&caching_ctl->count);
+       refcount_inc(&caching_ctl->count);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
        up_write(&fs_info->commit_root_sem);
 
@@ -892,7 +902,7 @@ search_again:
        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
        if (head) {
                if (!mutex_trylock(&head->mutex)) {
-                       atomic_inc(&head->node.refs);
+                       refcount_inc(&head->node.refs);
                        spin_unlock(&delayed_refs->lock);
 
                        btrfs_release_path(path);
@@ -2980,7 +2990,7 @@ again:
                                struct btrfs_delayed_ref_node *ref;
 
                                ref = &head->node;
-                               atomic_inc(&ref->refs);
+                               refcount_inc(&ref->refs);
 
                                spin_unlock(&delayed_refs->lock);
                                /*
@@ -3003,7 +3013,6 @@ again:
                goto again;
        }
 out:
-       assert_qgroups_uptodate(trans);
        trans->can_flush_pending_bgs = can_flush_pending_bgs;
        return 0;
 }
@@ -3057,7 +3066,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
        }
 
        if (!mutex_trylock(&head->mutex)) {
-               atomic_inc(&head->node.refs);
+               refcount_inc(&head->node.refs);
                spin_unlock(&delayed_refs->lock);
 
                btrfs_release_path(path);
@@ -3443,7 +3452,8 @@ again:
                /*
                 * don't bother trying to write stuff out _if_
                 * a) we're not cached,
-                * b) we're with nospace_cache mount option.
+                * b) we're with nospace_cache mount option,
+                * c) we're with v2 space_cache (FREE_SPACE_TREE).
                 */
                dcs = BTRFS_DC_WRITTEN;
                spin_unlock(&block_group->lock);
@@ -9917,6 +9927,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
        btrfs_init_free_space_ctl(cache);
        atomic_set(&cache->trimming, 0);
        mutex_init(&cache->free_space_lock);
+       btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
 
        return cache;
 }
@@ -10416,7 +10427,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                                    &fs_info->caching_block_groups, list)
                                if (ctl->block_group == block_group) {
                                        caching_ctl = ctl;
-                                       atomic_inc(&caching_ctl->count);
+                                       refcount_inc(&caching_ctl->count);
                                        break;
                                }
                }
@@ -10850,7 +10861,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
                spin_lock(&fs_info->trans_lock);
                trans = fs_info->running_transaction;
                if (trans)
-                       atomic_inc(&trans->use_count);
+                       refcount_inc(&trans->use_count);
                spin_unlock(&fs_info->trans_lock);
 
                ret = find_free_dev_extent_start(trans, device, minlen, start,
index 27fdb250b4467f65a8c6a42d06835f3bb3a36aec..d8da3edf2ac39ebcc0bde0ede7da74f0f81ad9ea 100644 (file)
@@ -68,7 +68,7 @@ void btrfs_leak_debug_check(void)
                pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
                       state->start, state->end, state->state,
                       extent_state_in_tree(state),
-                      atomic_read(&state->refs));
+                      refcount_read(&state->refs));
                list_del(&state->leak_list);
                kmem_cache_free(extent_state_cache, state);
        }
@@ -238,7 +238,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
        state->failrec = NULL;
        RB_CLEAR_NODE(&state->rb_node);
        btrfs_leak_debug_add(&state->leak_list, &states);
-       atomic_set(&state->refs, 1);
+       refcount_set(&state->refs, 1);
        init_waitqueue_head(&state->wq);
        trace_alloc_extent_state(state, mask, _RET_IP_);
        return state;
@@ -248,7 +248,7 @@ void free_extent_state(struct extent_state *state)
 {
        if (!state)
                return;
-       if (atomic_dec_and_test(&state->refs)) {
+       if (refcount_dec_and_test(&state->refs)) {
                WARN_ON(extent_state_in_tree(state));
                btrfs_leak_debug_del(&state->leak_list);
                trace_free_extent_state(state, _RET_IP_);
@@ -641,7 +641,7 @@ again:
                if (cached && extent_state_in_tree(cached) &&
                    cached->start <= start && cached->end > start) {
                        if (clear)
-                               atomic_dec(&cached->refs);
+                               refcount_dec(&cached->refs);
                        state = cached;
                        goto hit_next;
                }
@@ -793,7 +793,7 @@ process_node:
 
                if (state->state & bits) {
                        start = state->start;
-                       atomic_inc(&state->refs);
+                       refcount_inc(&state->refs);
                        wait_on_state(tree, state);
                        free_extent_state(state);
                        goto again;
@@ -834,7 +834,7 @@ static void cache_state_if_flags(struct extent_state *state,
        if (cached_ptr && !(*cached_ptr)) {
                if (!flags || (state->state & flags)) {
                        *cached_ptr = state;
-                       atomic_inc(&state->refs);
+                       refcount_inc(&state->refs);
                }
        }
 }
@@ -1538,7 +1538,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
                if (!found) {
                        *start = state->start;
                        *cached_state = state;
-                       atomic_inc(&state->refs);
+                       refcount_inc(&state->refs);
                }
                found++;
                *end = state->end;
@@ -2004,16 +2004,11 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
        u64 map_length = 0;
        u64 sector;
        struct btrfs_bio *bbio = NULL;
-       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
        int ret;
 
        ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
        BUG_ON(!mirror_num);
 
-       /* we can't repair anything in raid56 yet */
-       if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
-               return 0;
-
        bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
        if (!bio)
                return -EIO;
@@ -2026,17 +2021,35 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
         * read repair operation.
         */
        btrfs_bio_counter_inc_blocked(fs_info);
-       ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
-                             &map_length, &bbio, mirror_num);
-       if (ret) {
-               btrfs_bio_counter_dec(fs_info);
-               bio_put(bio);
-               return -EIO;
+       if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) {
+               /*
+                * Note that we don't use BTRFS_MAP_WRITE because it's supposed
+                * to update all raid stripes, but here we just want to correct
+                * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
+                * stripe's dev and sector.
+                */
+               ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+                                     &map_length, &bbio, 0);
+               if (ret) {
+                       btrfs_bio_counter_dec(fs_info);
+                       bio_put(bio);
+                       return -EIO;
+               }
+               ASSERT(bbio->mirror_num == 1);
+       } else {
+               ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
+                                     &map_length, &bbio, mirror_num);
+               if (ret) {
+                       btrfs_bio_counter_dec(fs_info);
+                       bio_put(bio);
+                       return -EIO;
+               }
+               BUG_ON(mirror_num != bbio->mirror_num);
        }
-       BUG_ON(mirror_num != bbio->mirror_num);
-       sector = bbio->stripes[mirror_num-1].physical >> 9;
+
+       sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
        bio->bi_iter.bi_sector = sector;
-       dev = bbio->stripes[mirror_num-1].dev;
+       dev = bbio->stripes[bbio->mirror_num - 1].dev;
        btrfs_put_bbio(bbio);
        if (!dev || !dev->bdev || !dev->writeable) {
                btrfs_bio_counter_dec(fs_info);
@@ -2859,7 +2872,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
                em = *em_cached;
                if (extent_map_in_tree(em) && start >= em->start &&
                    start < extent_map_end(em)) {
-                       atomic_inc(&em->refs);
+                       refcount_inc(&em->refs);
                        return em;
                }
 
@@ -2870,7 +2883,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
        em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
        if (em_cached && !IS_ERR_OR_NULL(em)) {
                BUG_ON(*em_cached);
-               atomic_inc(&em->refs);
+               refcount_inc(&em->refs);
                *em_cached = em;
        }
        return em;
index 3e4fad4a909d110d9283f979ccb9dec9a48c607c..1eafa2f0ede370ae802bb882557b3d4ad5c26340 100644 (file)
@@ -2,6 +2,7 @@
 #define __EXTENTIO__
 
 #include <linux/rbtree.h>
+#include <linux/refcount.h>
 #include "ulist.h"
 
 /* bits for the extent state */
 #define EXTENT_DEFRAG          (1U << 6)
 #define EXTENT_BOUNDARY                (1U << 9)
 #define EXTENT_NODATASUM       (1U << 10)
-#define EXTENT_DO_ACCOUNTING   (1U << 11)
+#define EXTENT_CLEAR_META_RESV (1U << 11)
 #define EXTENT_FIRST_DELALLOC  (1U << 12)
 #define EXTENT_NEED_WAIT       (1U << 13)
 #define EXTENT_DAMAGED         (1U << 14)
 #define EXTENT_NORESERVE       (1U << 15)
 #define EXTENT_QGROUP_RESERVED (1U << 16)
 #define EXTENT_CLEAR_DATA_RESV (1U << 17)
+#define EXTENT_DELALLOC_NEW    (1U << 18)
 #define EXTENT_IOBITS          (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_DO_ACCOUNTING    (EXTENT_CLEAR_META_RESV | \
+                                EXTENT_CLEAR_DATA_RESV)
 #define EXTENT_CTLBITS         (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
 /*
@@ -143,7 +147,7 @@ struct extent_state {
 
        /* ADD NEW ELEMENTS AFTER THIS */
        wait_queue_head_t wq;
-       atomic_t refs;
+       refcount_t refs;
        unsigned state;
 
        struct io_failure_record *failrec;
index 26f9ac719d20b4bff1a6b0a456ca45dd1752b4c7..69850155870c067d82768c67f3895a2e7a7c487d 100644 (file)
@@ -55,7 +55,7 @@ struct extent_map *alloc_extent_map(void)
        em->flags = 0;
        em->compress_type = BTRFS_COMPRESS_NONE;
        em->generation = 0;
-       atomic_set(&em->refs, 1);
+       refcount_set(&em->refs, 1);
        INIT_LIST_HEAD(&em->list);
        return em;
 }
@@ -71,8 +71,8 @@ void free_extent_map(struct extent_map *em)
 {
        if (!em)
                return;
-       WARN_ON(atomic_read(&em->refs) == 0);
-       if (atomic_dec_and_test(&em->refs)) {
+       WARN_ON(refcount_read(&em->refs) == 0);
+       if (refcount_dec_and_test(&em->refs)) {
                WARN_ON(extent_map_in_tree(em));
                WARN_ON(!list_empty(&em->list));
                if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
@@ -322,7 +322,7 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
                                        struct extent_map *em,
                                        int modified)
 {
-       atomic_inc(&em->refs);
+       refcount_inc(&em->refs);
        em->mod_start = em->start;
        em->mod_len = em->len;
 
@@ -381,7 +381,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
        if (strict && !(end > em->start && start < extent_map_end(em)))
                return NULL;
 
-       atomic_inc(&em->refs);
+       refcount_inc(&em->refs);
        return em;
 }
 
index eb8b8fae036bc3c67ceea03220cdca503626546f..a67b2def54131f10326c71092f80f2cd2d706212 100644 (file)
@@ -2,6 +2,7 @@
 #define __EXTENTMAP__
 
 #include <linux/rbtree.h>
+#include <linux/refcount.h>
 
 #define EXTENT_MAP_LAST_BYTE ((u64)-4)
 #define EXTENT_MAP_HOLE ((u64)-3)
@@ -41,7 +42,7 @@ struct extent_map {
                 */
                struct map_lookup *map_lookup;
        };
-       atomic_t refs;
+       refcount_t refs;
        unsigned int compress_type;
        struct list_head list;
 };
index 520cb7230b2d2cb5ca798c0030fa446957799456..da1096eb1a406f648b1bb0c7f3ee1da0e3013646 100644 (file)
@@ -1404,6 +1404,47 @@ fail:
 
 }
 
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+                                        const u64 start,
+                                        const u64 len,
+                                        struct extent_state **cached_state)
+{
+       u64 search_start = start;
+       const u64 end = start + len - 1;
+
+       while (search_start < end) {
+               const u64 search_len = end - search_start + 1;
+               struct extent_map *em;
+               u64 em_len;
+               int ret = 0;
+
+               em = btrfs_get_extent(inode, NULL, 0, search_start,
+                                     search_len, 0);
+               if (IS_ERR(em))
+                       return PTR_ERR(em);
+
+               if (em->block_start != EXTENT_MAP_HOLE)
+                       goto next;
+
+               em_len = em->len;
+               if (em->start < search_start)
+                       em_len -= search_start - em->start;
+               if (em_len > search_len)
+                       em_len = search_len;
+
+               ret = set_extent_bit(&inode->io_tree, search_start,
+                                    search_start + em_len - 1,
+                                    EXTENT_DELALLOC_NEW,
+                                    NULL, cached_state, GFP_NOFS);
+next:
+               search_start = extent_map_end(em);
+               free_extent_map(em);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
 /*
  * This function locks the extent and properly waits for data=ordered extents
  * to finish before allowing the pages to be modified if need.
@@ -1432,8 +1473,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
                + round_up(pos + write_bytes - start_pos,
                           fs_info->sectorsize) - 1;
 
-       if (start_pos < inode->vfs_inode.i_size) {
+       if (start_pos < inode->vfs_inode.i_size ||
+           (inode->flags & BTRFS_INODE_PREALLOC)) {
                struct btrfs_ordered_extent *ordered;
+               unsigned int clear_bits;
+
                lock_extent_bits(&inode->io_tree, start_pos, last_pos,
                                cached_state);
                ordered = btrfs_lookup_ordered_range(inode, start_pos,
@@ -1454,11 +1498,19 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
                }
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
-
+               ret = btrfs_find_new_delalloc_bytes(inode, start_pos,
+                                                   last_pos - start_pos + 1,
+                                                   cached_state);
+               clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC |
+                       EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG;
+               if (ret)
+                       clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED;
                clear_extent_bit(&inode->io_tree, start_pos,
-                                 last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
-                                 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-                                 0, 0, cached_state, GFP_NOFS);
+                                last_pos, clear_bits,
+                                (clear_bits & EXTENT_LOCKED) ? 1 : 0,
+                                0, cached_state, GFP_NOFS);
+               if (ret)
+                       return ret;
                *lockstart = start_pos;
                *lockend = last_pos;
                ret = 1;
@@ -2342,13 +2394,8 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
        int ret = 0;
 
        em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0);
-       if (IS_ERR_OR_NULL(em)) {
-               if (!em)
-                       ret = -ENOMEM;
-               else
-                       ret = PTR_ERR(em);
-               return ret;
-       }
+       if (IS_ERR(em))
+               return PTR_ERR(em);
 
        /* Hole or vacuum extent(only exists in no-hole mode) */
        if (em->block_start == EXTENT_MAP_HOLE) {
@@ -2835,11 +2882,8 @@ static long btrfs_fallocate(struct file *file, int mode,
        while (1) {
                em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
                                      alloc_end - cur_offset, 0);
-               if (IS_ERR_OR_NULL(em)) {
-                       if (!em)
-                               ret = -ENOMEM;
-                       else
-                               ret = PTR_ERR(em);
+               if (IS_ERR(em)) {
+                       ret = PTR_ERR(em);
                        break;
                }
                last_byte = min(extent_map_end(em), alloc_end);
@@ -2856,8 +2900,10 @@ static long btrfs_fallocate(struct file *file, int mode,
                        }
                        ret = btrfs_qgroup_reserve_data(inode, cur_offset,
                                        last_byte - cur_offset);
-                       if (ret < 0)
+                       if (ret < 0) {
+                               free_extent_map(em);
                                break;
+                       }
                } else {
                        /*
                         * Do not need to reserve unwritten extent for this
index da6841efac26b1be3509ad3e410c34e72b253a65..c5e6180cdb8c9250e3a525e311a42795c4831c32 100644 (file)
@@ -355,7 +355,7 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
        io_ctl->orig = io_ctl->cur;
        io_ctl->size = PAGE_SIZE;
        if (clear)
-               memset(io_ctl->cur, 0, PAGE_SIZE);
+               clear_page(io_ctl->cur);
 }
 
 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
index 5e71f1ea3391b034dc8e6f55f62d82dbe76e9811..17cbe9306fafd9b9e7247bec4308494ed770b28b 100644 (file)
@@ -115,6 +115,31 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
                                       u64 ram_bytes, int compress_type,
                                       int type);
 
+static void __endio_write_update_ordered(struct inode *inode,
+                                        const u64 offset, const u64 bytes,
+                                        const bool uptodate);
+
+/*
+ * Cleanup all submitted ordered extents in specified range to handle errors
+ * from the fill_delalloc() callback.
+ *
+ * NOTE: caller must ensure that when an error happens, it can not call
+ * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
+ * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
+ * to be released, which we want to happen only when finishing the ordered
+ * extent (btrfs_finish_ordered_io()). Also note that the caller of the
+ * fill_delalloc() callback already does proper cleanup for the first page of
+ * the range, that is, it invokes the callback writepage_end_io_hook() for the
+ * range of the first page.
+ */
+static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+                                                const u64 offset,
+                                                const u64 bytes)
+{
+       __endio_write_update_ordered(inode, offset + PAGE_SIZE,
+                                    bytes - PAGE_SIZE, false);
+}
+
 static int btrfs_dirty_inode(struct inode *inode);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -547,7 +572,7 @@ cont:
                }
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
-                               EXTENT_DEFRAG;
+                               EXTENT_DELALLOC_NEW | EXTENT_DEFRAG;
                        unsigned long page_error_op;
 
                        clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
@@ -565,8 +590,10 @@ cont:
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
-                       btrfs_free_reserved_data_space_noquota(inode, start,
-                                               end - start + 1);
+                       if (ret == 0)
+                               btrfs_free_reserved_data_space_noquota(inode,
+                                                              start,
+                                                              end - start + 1);
                        goto free_pages_out;
                }
        }
@@ -852,6 +879,7 @@ out_free:
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+                                    EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
@@ -918,10 +946,13 @@ static noinline int cow_file_range(struct inode *inode,
        u64 num_bytes;
        unsigned long ram_size;
        u64 disk_num_bytes;
-       u64 cur_alloc_size;
+       u64 cur_alloc_size = 0;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
+       unsigned clear_bits;
+       unsigned long page_ops;
+       bool extent_reserved = false;
        int ret = 0;
 
        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
@@ -944,6 +975,7 @@ static noinline int cow_file_range(struct inode *inode,
                        extent_clear_unlock_delalloc(inode, start, end,
                                     delalloc_end, NULL,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
+                                    EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);
@@ -966,14 +998,14 @@ static noinline int cow_file_range(struct inode *inode,
                        start + num_bytes - 1, 0);
 
        while (disk_num_bytes > 0) {
-               unsigned long op;
-
                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           fs_info->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;
+               cur_alloc_size = ins.offset;
+               extent_reserved = true;
 
                ram_size = ins.offset;
                em = create_io_em(inode, start, ins.offset, /* len */
@@ -988,7 +1020,6 @@ static noinline int cow_file_range(struct inode *inode,
                        goto out_reserve;
                free_extent_map(em);
 
-               cur_alloc_size = ins.offset;
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
@@ -998,15 +1029,24 @@ static noinline int cow_file_range(struct inode *inode,
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
+                       /*
+                        * Only drop cache here, and process as normal.
+                        *
+                        * We must not allow extent_clear_unlock_delalloc()
+                        * at out_unlock label to free meta of this ordered
+                        * extent, as its meta should be freed by
+                        * btrfs_finish_ordered_io().
+                        *
+                        * So we must continue until @start is increased to
+                        * skip current ordered extent.
+                        */
                        if (ret)
-                               goto out_drop_extent_cache;
+                               btrfs_drop_extent_cache(BTRFS_I(inode), start,
+                                               start + ram_size - 1, 0);
                }
 
                btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 
-               if (disk_num_bytes < cur_alloc_size)
-                       break;
-
                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
@@ -1014,18 +1054,30 @@ static noinline int cow_file_range(struct inode *inode,
                 * Do set the Private2 bit so we know this page was properly
                 * setup for writepage
                 */
-               op = unlock ? PAGE_UNLOCK : 0;
-               op |= PAGE_SET_PRIVATE2;
+               page_ops = unlock ? PAGE_UNLOCK : 0;
+               page_ops |= PAGE_SET_PRIVATE2;
 
                extent_clear_unlock_delalloc(inode, start,
                                             start + ram_size - 1,
                                             delalloc_end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
-                                            op);
-               disk_num_bytes -= cur_alloc_size;
+                                            page_ops);
+               if (disk_num_bytes < cur_alloc_size)
+                       disk_num_bytes = 0;
+               else
+                       disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
+               extent_reserved = false;
+
+               /*
+                * btrfs_reloc_clone_csums() error, since start is increased
+                * extent_clear_unlock_delalloc() at out_unlock label won't
+                * free metadata of current ordered extent, we're OK to exit.
+                */
+               if (ret)
+                       goto out_unlock;
        }
 out:
        return ret;
@@ -1036,12 +1088,35 @@ out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
 out_unlock:
+       clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+               EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+       page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
+               PAGE_END_WRITEBACK;
+       /*
+        * If we reserved an extent for our delalloc range (or a subrange) and
+        * failed to create the respective ordered extent, then it means that
+        * when we reserved the extent we decremented the extent's size from
+        * the data space_info's bytes_may_use counter and incremented the
+        * space_info's bytes_reserved counter by the same amount. We must make
+        * sure extent_clear_unlock_delalloc() does not try to decrement again
+        * the data space_info's bytes_may_use counter, therefore we do not pass
+        * it the flag EXTENT_CLEAR_DATA_RESV.
+        */
+       if (extent_reserved) {
+               extent_clear_unlock_delalloc(inode, start,
+                                            start + cur_alloc_size,
+                                            start + cur_alloc_size,
+                                            locked_page,
+                                            clear_bits,
+                                            page_ops);
+               start += cur_alloc_size;
+               if (start >= end)
+                       goto out;
+       }
        extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
                                     locked_page,
-                                    EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
-                                    EXTENT_DELALLOC | EXTENT_DEFRAG,
-                                    PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
-                                    PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+                                    clear_bits | EXTENT_CLEAR_DATA_RESV,
+                                    page_ops);
        goto out;
 }
 
@@ -1414,15 +1489,14 @@ out_check:
                BUG_ON(ret); /* -ENOMEM */
 
                if (root->root_key.objectid ==
-                   BTRFS_DATA_RELOC_TREE_OBJECTID) {
+                   BTRFS_DATA_RELOC_TREE_OBJECTID)
+                       /*
+                        * Error handled later, as we must prevent
+                        * extent_clear_unlock_delalloc() in error handler
+                        * from freeing metadata of created ordered extent.
+                        */
                        ret = btrfs_reloc_clone_csums(inode, cur_offset,
                                                      num_bytes);
-                       if (ret) {
-                               if (!nolock && nocow)
-                                       btrfs_end_write_no_snapshoting(root);
-                               goto error;
-                       }
-               }
 
                extent_clear_unlock_delalloc(inode, cur_offset,
                                             cur_offset + num_bytes - 1, end,
@@ -1434,6 +1508,14 @@ out_check:
                if (!nolock && nocow)
                        btrfs_end_write_no_snapshoting(root);
                cur_offset = extent_end;
+
+               /*
+                * btrfs_reloc_clone_csums() error, now we're OK to call error
+                * handler, as metadata for created ordered extent will only
+                * be freed by btrfs_finish_ordered_io().
+                */
+               if (ret)
+                       goto error;
                if (cur_offset > end)
                        break;
        }
@@ -1509,6 +1591,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
                ret = cow_file_range_async(inode, locked_page, start, end,
                                           page_started, nr_written);
        }
+       if (ret)
+               btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
        return ret;
 }
 
@@ -1693,6 +1777,14 @@ static void btrfs_set_bit_hook(struct inode *inode,
                        btrfs_add_delalloc_inodes(root, inode);
                spin_unlock(&BTRFS_I(inode)->lock);
        }
+
+       if (!(state->state & EXTENT_DELALLOC_NEW) &&
+           (*bits & EXTENT_DELALLOC_NEW)) {
+               spin_lock(&BTRFS_I(inode)->lock);
+               BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
+                       state->start;
+               spin_unlock(&BTRFS_I(inode)->lock);
+       }
 }
 
 /*
@@ -1722,7 +1814,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
 
                if (*bits & EXTENT_FIRST_DELALLOC) {
                        *bits &= ~EXTENT_FIRST_DELALLOC;
-               } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
+               } else if (!(*bits & EXTENT_CLEAR_META_RESV)) {
                        spin_lock(&inode->lock);
                        inode->outstanding_extents -= num_extents;
                        spin_unlock(&inode->lock);
@@ -1733,7 +1825,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
                 * don't need to call dellalloc_release_metadata if there is an
                 * error.
                 */
-               if (*bits & EXTENT_DO_ACCOUNTING &&
+               if (*bits & EXTENT_CLEAR_META_RESV &&
                    root != fs_info->tree_root)
                        btrfs_delalloc_release_metadata(inode, len);
 
@@ -1741,10 +1833,9 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
                if (btrfs_is_testing(fs_info))
                        return;
 
-               if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-                   && do_list && !(state->state & EXTENT_NORESERVE)
-                   && (*bits & (EXTENT_DO_ACCOUNTING |
-                   EXTENT_CLEAR_DATA_RESV)))
+               if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+                   do_list && !(state->state & EXTENT_NORESERVE) &&
+                   (*bits & EXTENT_CLEAR_DATA_RESV))
                        btrfs_free_reserved_data_space_noquota(
                                        &inode->vfs_inode,
                                        state->start, len);
@@ -1759,6 +1850,14 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
                        btrfs_del_delalloc_inode(root, inode);
                spin_unlock(&inode->lock);
        }
+
+       if ((state->state & EXTENT_DELALLOC_NEW) &&
+           (*bits & EXTENT_DELALLOC_NEW)) {
+               spin_lock(&inode->lock);
+               ASSERT(inode->new_delalloc_bytes >= len);
+               inode->new_delalloc_bytes -= len;
+               spin_unlock(&inode->lock);
+       }
 }
 
 /*
@@ -2791,6 +2890,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
        u64 logical_len = ordered_extent->len;
        bool nolock;
        bool truncated = false;
+       bool range_locked = false;
+       bool clear_new_delalloc_bytes = false;
+
+       if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+           !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
+           !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
+               clear_new_delalloc_bytes = true;
 
        nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
 
@@ -2839,6 +2945,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                goto out;
        }
 
+       range_locked = true;
        lock_extent_bits(io_tree, ordered_extent->file_offset,
                         ordered_extent->file_offset + ordered_extent->len - 1,
                         &cached_state);
@@ -2864,7 +2971,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
-               goto out_unlock;
+               goto out;
        }
 
        trans->block_rsv = &fs_info->delalloc_block_rsv;
@@ -2896,7 +3003,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                           trans->transid);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
-               goto out_unlock;
+               goto out;
        }
 
        add_pending_csums(trans, inode, &ordered_extent->list);
@@ -2905,14 +3012,26 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
        ret = btrfs_update_inode_fallback(trans, root, inode);
        if (ret) { /* -ENOMEM or corruption */
                btrfs_abort_transaction(trans, ret);
-               goto out_unlock;
+               goto out;
        }
        ret = 0;
-out_unlock:
-       unlock_extent_cached(io_tree, ordered_extent->file_offset,
-                            ordered_extent->file_offset +
-                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
 out:
+       if (range_locked || clear_new_delalloc_bytes) {
+               unsigned int clear_bits = 0;
+
+               if (range_locked)
+                       clear_bits |= EXTENT_LOCKED;
+               if (clear_new_delalloc_bytes)
+                       clear_bits |= EXTENT_DELALLOC_NEW;
+               clear_extent_bit(&BTRFS_I(inode)->io_tree,
+                                ordered_extent->file_offset,
+                                ordered_extent->file_offset +
+                                ordered_extent->len - 1,
+                                clear_bits,
+                                (clear_bits & EXTENT_LOCKED) ? 1 : 0,
+                                0, &cached_state, GFP_NOFS);
+       }
+
        if (root != fs_info->tree_root)
                btrfs_delalloc_release_metadata(BTRFS_I(inode),
                                ordered_extent->len);
@@ -4401,9 +4520,17 @@ search_again:
                        if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
                                item_end +=
                                    btrfs_file_extent_num_bytes(leaf, fi);
+
+                               trace_btrfs_truncate_show_fi_regular(
+                                       BTRFS_I(inode), leaf, fi,
+                                       found_key.offset);
                        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                                item_end += btrfs_file_extent_inline_len(leaf,
                                                         path->slots[0], fi);
+
+                               trace_btrfs_truncate_show_fi_inline(
+                                       BTRFS_I(inode), leaf, fi, path->slots[0],
+                                       found_key.offset);
                        }
                        item_end--;
                }
@@ -4603,13 +4730,6 @@ error:
 
        btrfs_free_path(path);
 
-       if (err == 0) {
-               /* only inline file may have last_size != new_size */
-               if (new_size >= fs_info->sectorsize ||
-                   new_size > fs_info->max_inline)
-                       ASSERT(last_size == new_size);
-       }
-
        if (be_nice && bytes_deleted > SZ_32M) {
                unsigned long updates = trans->delayed_ref_updates;
                if (updates) {
@@ -6735,7 +6855,6 @@ static noinline int uncompress_inline(struct btrfs_path *path,
  *
  * This also copies inline extents directly into the page.
  */
-
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
                struct page *page,
            size_t pg_offset, u64 start, u64 len,
@@ -6835,11 +6954,18 @@ again:
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                extent_end = extent_start +
                       btrfs_file_extent_num_bytes(leaf, item);
+
+               trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
+                                                      extent_start);
        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                size_t size;
                size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
                extent_end = ALIGN(extent_start + size,
                                   fs_info->sectorsize);
+
+               trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
+                                                     path->slots[0],
+                                                     extent_start);
        }
 next:
        if (start >= extent_end) {
@@ -7037,19 +7163,17 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
        em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
        if (IS_ERR(em))
                return em;
-       if (em) {
-               /*
-                * if our em maps to
-                * -  a hole or
-                * -  a pre-alloc extent,
-                * there might actually be delalloc bytes behind it.
-                */
-               if (em->block_start != EXTENT_MAP_HOLE &&
-                   !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-                       return em;
-               else
-                       hole_em = em;
-       }
+       /*
+        * If our em maps to:
+        * - a hole or
+        * - a pre-alloc extent,
+        * there might actually be delalloc bytes behind it.
+        */
+       if (em->block_start != EXTENT_MAP_HOLE &&
+           !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+               return em;
+       else
+               hole_em = em;
 
        /* check to see if we've wrapped (len == -1 or similar) */
        end = start + len;
@@ -8127,17 +8251,26 @@ static void btrfs_endio_direct_read(struct bio *bio)
        bio_put(bio);
 }
 
-static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
-                                                   const u64 offset,
-                                                   const u64 bytes,
-                                                   const int uptodate)
+static void __endio_write_update_ordered(struct inode *inode,
+                                        const u64 offset, const u64 bytes,
+                                        const bool uptodate)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ordered_extent *ordered = NULL;
+       struct btrfs_workqueue *wq;
+       btrfs_work_func_t func;
        u64 ordered_offset = offset;
        u64 ordered_bytes = bytes;
        int ret;
 
+       if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+               wq = fs_info->endio_freespace_worker;
+               func = btrfs_freespace_write_helper;
+       } else {
+               wq = fs_info->endio_write_workers;
+               func = btrfs_endio_write_helper;
+       }
+
 again:
        ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
                                                   &ordered_offset,
@@ -8146,9 +8279,8 @@ again:
        if (!ret)
                goto out_test;
 
-       btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
-                       finish_ordered_fn, NULL, NULL);
-       btrfs_queue_work(fs_info->endio_write_workers, &ordered->work);
+       btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL);
+       btrfs_queue_work(wq, &ordered->work);
 out_test:
        /*
         * our bio might span multiple ordered extents.  If we haven't
@@ -8166,10 +8298,8 @@ static void btrfs_endio_direct_write(struct bio *bio)
        struct btrfs_dio_private *dip = bio->bi_private;
        struct bio *dio_bio = dip->dio_bio;
 
-       btrfs_endio_direct_write_update_ordered(dip->inode,
-                                               dip->logical_offset,
-                                               dip->bytes,
-                                               !bio->bi_error);
+       __endio_write_update_ordered(dip->inode, dip->logical_offset,
+                                    dip->bytes, !bio->bi_error);
 
        kfree(dip);
 
@@ -8530,10 +8660,10 @@ free_ordered:
                io_bio = NULL;
        } else {
                if (write)
-                       btrfs_endio_direct_write_update_ordered(inode,
+                       __endio_write_update_ordered(inode,
                                                file_offset,
                                                dio_bio->bi_iter.bi_size,
-                                               0);
+                                               false);
                else
                        unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
                              file_offset + dio_bio->bi_iter.bi_size - 1);
@@ -8668,11 +8798,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                         */
                        if (dio_data.unsubmitted_oe_range_start <
                            dio_data.unsubmitted_oe_range_end)
-                               btrfs_endio_direct_write_update_ordered(inode,
+                               __endio_write_update_ordered(inode,
                                        dio_data.unsubmitted_oe_range_start,
                                        dio_data.unsubmitted_oe_range_end -
                                        dio_data.unsubmitted_oe_range_start,
-                                       0);
+                                       false);
                } else if (ret >= 0 && (size_t)ret < count)
                        btrfs_delalloc_release_space(inode, offset,
                                                     count - (size_t)ret);
@@ -8819,6 +8949,7 @@ again:
                if (!inode_evicting)
                        clear_extent_bit(tree, start, end,
                                         EXTENT_DIRTY | EXTENT_DELALLOC |
+                                        EXTENT_DELALLOC_NEW |
                                         EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
                                         EXTENT_DEFRAG, 1, 0, &cached_state,
                                         GFP_NOFS);
@@ -8876,8 +9007,8 @@ again:
        if (!inode_evicting) {
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
-                                EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
-                                EXTENT_DEFRAG, 1, 1,
+                                EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+                                EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
                                 &cached_state, GFP_NOFS);
 
                __btrfs_releasepage(page, GFP_NOFS);
@@ -9248,6 +9379,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->last_sub_trans = 0;
        ei->logged_trans = 0;
        ei->delalloc_bytes = 0;
+       ei->new_delalloc_bytes = 0;
        ei->defrag_bytes = 0;
        ei->disk_i_size = 0;
        ei->flags = 0;
@@ -9313,6 +9445,7 @@ void btrfs_destroy_inode(struct inode *inode)
        WARN_ON(BTRFS_I(inode)->outstanding_extents);
        WARN_ON(BTRFS_I(inode)->reserved_extents);
        WARN_ON(BTRFS_I(inode)->delalloc_bytes);
+       WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
        WARN_ON(BTRFS_I(inode)->csum_bytes);
        WARN_ON(BTRFS_I(inode)->defrag_bytes);
 
@@ -9436,7 +9569,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
        stat->dev = BTRFS_I(inode)->root->anon_dev;
 
        spin_lock(&BTRFS_I(inode)->lock);
-       delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+       delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
        spin_unlock(&BTRFS_I(inode)->lock);
        stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
                        ALIGN(delalloc_bytes, blocksize)) >> 9;
index 922a66fce401784c6ea2458b6d84637169b4a774..e176375f374f917be5c50a46ed1c94444d1949d3 100644 (file)
@@ -1504,7 +1504,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
        if (ret)
                return ret;
 
-       if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+       if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
                mnt_drop_write_file(file);
                return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
        }
@@ -1619,7 +1619,7 @@ out_free:
        kfree(vol_args);
 out:
        mutex_unlock(&fs_info->volume_mutex);
-       atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
        mnt_drop_write_file(file);
        return ret;
 }
@@ -2661,7 +2661,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1))
+       if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
                return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 
        mutex_lock(&fs_info->volume_mutex);
@@ -2680,7 +2680,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
        kfree(vol_args);
 out:
        mutex_unlock(&fs_info->volume_mutex);
-       atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
        return ret;
 }
 
@@ -2708,7 +2708,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
        if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
                return -EOPNOTSUPP;
 
-       if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+       if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
                ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                goto out;
        }
@@ -2721,7 +2721,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
                ret = btrfs_rm_device(fs_info, vol_args->name, 0);
        }
        mutex_unlock(&fs_info->volume_mutex);
-       atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 
        if (!ret) {
                if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
@@ -2752,7 +2752,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
        if (ret)
                return ret;
 
-       if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+       if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
                ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                goto out_drop_write;
        }
@@ -2772,7 +2772,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
                btrfs_info(fs_info, "disk deleted %s", vol_args->name);
        kfree(vol_args);
 out:
-       atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 out_drop_write:
        mnt_drop_write_file(file);
 
@@ -4439,13 +4439,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
                        ret = -EROFS;
                        goto out;
                }
-               if (atomic_xchg(
-                       &fs_info->mutually_exclusive_operation_running, 1)) {
+               if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
                        ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                } else {
                        ret = btrfs_dev_replace_by_ioctl(fs_info, p);
-                       atomic_set(
-                        &fs_info->mutually_exclusive_operation_running, 0);
+                       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
                }
                break;
        case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
@@ -4640,7 +4638,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
                return ret;
 
 again:
-       if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+       if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
                mutex_lock(&fs_info->volume_mutex);
                mutex_lock(&fs_info->balance_mutex);
                need_unlock = true;
@@ -4686,7 +4684,7 @@ again:
        }
 
 locked:
-       BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
+       BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
 
        if (arg) {
                bargs = memdup_user(arg, sizeof(*bargs));
@@ -4742,11 +4740,10 @@ locked:
 
 do_balance:
        /*
-        * Ownership of bctl and mutually_exclusive_operation_running
+        * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP
         * goes to to btrfs_balance.  bctl is freed in __cancel_balance,
         * or, if restriper was paused all the way until unmount, in
-        * free_fs_info.  mutually_exclusive_operation_running is
-        * cleared in __cancel_balance.
+        * free_fs_info.  The flag is cleared in __cancel_balance.
         */
        need_unlock = false;
 
@@ -4766,7 +4763,7 @@ out_unlock:
        mutex_unlock(&fs_info->balance_mutex);
        mutex_unlock(&fs_info->volume_mutex);
        if (need_unlock)
-               atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+               clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 out:
        mnt_drop_write_file(file);
        return ret;
index 9a46878ba60fa973562139f32629810b476ed453..7b40e2e7292a41a047b0f8e300304822b7c63e74 100644 (file)
@@ -212,7 +212,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
 
        /* one ref for the tree */
-       atomic_set(&entry->refs, 1);
+       refcount_set(&entry->refs, 1);
        init_waitqueue_head(&entry->wait);
        INIT_LIST_HEAD(&entry->list);
        INIT_LIST_HEAD(&entry->root_extent_list);
@@ -358,7 +358,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
 out:
        if (!ret && cached && entry) {
                *cached = entry;
-               atomic_inc(&entry->refs);
+               refcount_inc(&entry->refs);
        }
        spin_unlock_irqrestore(&tree->lock, flags);
        return ret == 0;
@@ -425,7 +425,7 @@ have_entry:
 out:
        if (!ret && cached && entry) {
                *cached = entry;
-               atomic_inc(&entry->refs);
+               refcount_inc(&entry->refs);
        }
        spin_unlock_irqrestore(&tree->lock, flags);
        return ret == 0;
@@ -456,7 +456,7 @@ void btrfs_get_logged_extents(struct btrfs_inode *inode,
                if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
                        continue;
                list_add(&ordered->log_list, logged_list);
-               atomic_inc(&ordered->refs);
+               refcount_inc(&ordered->refs);
        }
        spin_unlock_irq(&tree->lock);
 }
@@ -565,7 +565,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 
        trace_btrfs_ordered_extent_put(entry->inode, entry);
 
-       if (atomic_dec_and_test(&entry->refs)) {
+       if (refcount_dec_and_test(&entry->refs)) {
                ASSERT(list_empty(&entry->log_list));
                ASSERT(list_empty(&entry->trans_list));
                ASSERT(list_empty(&entry->root_extent_list));
@@ -623,7 +623,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
                spin_lock(&fs_info->trans_lock);
                trans = fs_info->running_transaction;
                if (trans)
-                       atomic_inc(&trans->use_count);
+                       refcount_inc(&trans->use_count);
                spin_unlock(&fs_info->trans_lock);
 
                ASSERT(trans);
@@ -690,7 +690,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
 
                list_move_tail(&ordered->root_extent_list,
                               &root->ordered_extents);
-               atomic_inc(&ordered->refs);
+               refcount_inc(&ordered->refs);
                spin_unlock(&root->ordered_extent_lock);
 
                btrfs_init_work(&ordered->flush_work,
@@ -870,7 +870,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
        if (!offset_in_entry(entry, file_offset))
                entry = NULL;
        if (entry)
-               atomic_inc(&entry->refs);
+               refcount_inc(&entry->refs);
 out:
        spin_unlock_irq(&tree->lock);
        return entry;
@@ -911,7 +911,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
        }
 out:
        if (entry)
-               atomic_inc(&entry->refs);
+               refcount_inc(&entry->refs);
        spin_unlock_irq(&tree->lock);
        return entry;
 }
@@ -948,7 +948,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
                goto out;
 
        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-       atomic_inc(&entry->refs);
+       refcount_inc(&entry->refs);
 out:
        spin_unlock_irq(&tree->lock);
        return entry;
index 195c93b67fe002861153fb58f6506fa93deb0434..e0c1d5b8d859c95c9e870b8feb735c921a09d8f1 100644 (file)
@@ -113,7 +113,7 @@ struct btrfs_ordered_extent {
        int compress_type;
 
        /* reference count */
-       atomic_t refs;
+       refcount_t refs;
 
        /* the inode we belong to */
        struct inode *inode;
index afbea61d957e893db09effb75ec47c7d670e24e3..deffbeb74a0be7499c42578efc51903c3737f3bf 100644 (file)
  *  - check all ioctl parameters
  */
 
-/*
- * one struct for each qgroup, organized in fs_info->qgroup_tree.
- */
-struct btrfs_qgroup {
-       u64 qgroupid;
-
-       /*
-        * state
-        */
-       u64 rfer;       /* referenced */
-       u64 rfer_cmpr;  /* referenced compressed */
-       u64 excl;       /* exclusive */
-       u64 excl_cmpr;  /* exclusive compressed */
-
-       /*
-        * limits
-        */
-       u64 lim_flags;  /* which limits are set */
-       u64 max_rfer;
-       u64 max_excl;
-       u64 rsv_rfer;
-       u64 rsv_excl;
-
-       /*
-        * reservation tracking
-        */
-       u64 reserved;
-
-       /*
-        * lists
-        */
-       struct list_head groups;  /* groups this group is member of */
-       struct list_head members; /* groups that are members of this group */
-       struct list_head dirty;   /* dirty groups */
-       struct rb_node node;      /* tree of qgroups */
-
-       /*
-        * temp variables for accounting operations
-        * Refer to qgroup_shared_accounting() for details.
-        */
-       u64 old_refcnt;
-       u64 new_refcnt;
-};
-
 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
 {
@@ -1078,6 +1034,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
        qgroup->excl += sign * num_bytes;
        qgroup->excl_cmpr += sign * num_bytes;
        if (sign > 0) {
+               trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes);
                if (qgroup->reserved < num_bytes)
                        report_reserved_underflow(fs_info, qgroup, num_bytes);
                else
@@ -1103,6 +1060,8 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
                WARN_ON(sign < 0 && qgroup->excl < num_bytes);
                qgroup->excl += sign * num_bytes;
                if (sign > 0) {
+                       trace_qgroup_update_reserve(fs_info, qgroup,
+                                                   -(s64)num_bytes);
                        if (qgroup->reserved < num_bytes)
                                report_reserved_underflow(fs_info, qgroup,
                                                          num_bytes);
@@ -2058,12 +2017,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
 
                if (!ret) {
                        /*
-                        * Use (u64)-1 as time_seq to do special search, which
+                        * Use SEQ_LAST as time_seq to do special search, which
                         * doesn't lock tree or delayed_refs and search current
                         * root. It's safe inside commit_transaction().
                         */
                        ret = btrfs_find_all_roots(trans, fs_info,
-                                       record->bytenr, (u64)-1, &new_roots);
+                                       record->bytenr, SEQ_LAST, &new_roots);
                        if (ret < 0)
                                goto cleanup;
                        if (qgroup_to_skip)
@@ -2370,6 +2329,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 ref_root = root->root_key.objectid;
        int ret = 0;
+       int retried = 0;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
 
@@ -2378,7 +2338,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
 
        if (num_bytes == 0)
                return 0;
-
+retry:
        spin_lock(&fs_info->qgroup_lock);
        quota_root = fs_info->quota_root;
        if (!quota_root)
@@ -2405,6 +2365,27 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
                qg = unode_aux_to_qgroup(unode);
 
                if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+                       /*
+                        * Commit the tree and retry, since we may have
+                        * deletions which would free up space.
+                        */
+                       if (!retried && qg->reserved > 0) {
+                               struct btrfs_trans_handle *trans;
+
+                               spin_unlock(&fs_info->qgroup_lock);
+                               ret = btrfs_start_delalloc_inodes(root, 0);
+                               if (ret)
+                                       return ret;
+                               btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
+                               trans = btrfs_join_transaction(root);
+                               if (IS_ERR(trans))
+                                       return PTR_ERR(trans);
+                               ret = btrfs_commit_transaction(trans);
+                               if (ret)
+                                       return ret;
+                               retried++;
+                               goto retry;
+                       }
                        ret = -EDQUOT;
                        goto out;
                }
@@ -2427,6 +2408,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
 
                qg = unode_aux_to_qgroup(unode);
 
+               trace_qgroup_update_reserve(fs_info, qg, num_bytes);
                qg->reserved += num_bytes;
        }
 
@@ -2472,6 +2454,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
 
                qg = unode_aux_to_qgroup(unode);
 
+               trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes);
                if (qg->reserved < num_bytes)
                        report_reserved_underflow(fs_info, qg, num_bytes);
                else
@@ -2490,18 +2473,6 @@ out:
        spin_unlock(&fs_info->qgroup_lock);
 }
 
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
-{
-       if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
-               return;
-       btrfs_err(trans->fs_info,
-               "qgroups not uptodate in trans handle %p:  list is%s empty, seq is %#x.%x",
-               trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
-               (u32)(trans->delayed_ref_elem.seq >> 32),
-               (u32)trans->delayed_ref_elem.seq);
-       BUG();
-}
-
 /*
  * returns < 0 on error, 0 when more leafs are to be scanned.
  * returns 1 when done.
@@ -2889,14 +2860,14 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
        if (ret < 0)
                goto out;
 
-       if (free) {
-               btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
-                               BTRFS_I(inode)->root->objectid,
-                               changeset.bytes_changed);
+       if (free)
                trace_op = QGROUP_FREE;
-       }
        trace_btrfs_qgroup_release_data(inode, start, len,
                                        changeset.bytes_changed, trace_op);
+       if (free)
+               btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
+                               BTRFS_I(inode)->root->objectid,
+                               changeset.bytes_changed);
 out:
        ulist_release(&changeset.range_changed);
        return ret;
@@ -2948,6 +2919,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
                return 0;
 
        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
+       trace_qgroup_meta_reserve(root, (s64)num_bytes);
        ret = qgroup_reserve(root, num_bytes, enforce);
        if (ret < 0)
                return ret;
@@ -2967,6 +2939,7 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
        reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0);
        if (reserved == 0)
                return;
+       trace_qgroup_meta_reserve(root, -(s64)reserved);
        btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved);
 }
 
@@ -2981,6 +2954,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
        WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes);
        atomic64_sub(num_bytes, &root->qgroup_meta_rsv);
+       trace_qgroup_meta_reserve(root, -(s64)num_bytes);
        btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes);
 }
 
index 26932a8a19930bc48ff14ad3c154a7326ce3a111..fe04d3f295c67f0b97b79095885878237260a274 100644 (file)
@@ -61,6 +61,50 @@ struct btrfs_qgroup_extent_record {
        struct ulist *old_roots;
 };
 
+/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ */
+struct btrfs_qgroup {
+       u64 qgroupid;
+
+       /*
+        * state
+        */
+       u64 rfer;       /* referenced */
+       u64 rfer_cmpr;  /* referenced compressed */
+       u64 excl;       /* exclusive */
+       u64 excl_cmpr;  /* exclusive compressed */
+
+       /*
+        * limits
+        */
+       u64 lim_flags;  /* which limits are set */
+       u64 max_rfer;
+       u64 max_excl;
+       u64 rsv_rfer;
+       u64 rsv_excl;
+
+       /*
+        * reservation tracking
+        */
+       u64 reserved;
+
+       /*
+        * lists
+        */
+       struct list_head groups;  /* groups this group is member of */
+       struct list_head members; /* groups that are members of this group */
+       struct list_head dirty;   /* dirty groups */
+       struct rb_node node;      /* tree of qgroups */
+
+       /*
+        * temp variables for accounting operations
+        * Refer to qgroup_shared_accounting() for details.
+        */
+       u64 old_refcnt;
+       u64 new_refcnt;
+};
+
 /*
  * For qgroup event trace points only
  */
@@ -186,17 +230,12 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
                         struct btrfs_qgroup_inherit *inherit);
 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
                               u64 ref_root, u64 num_bytes);
-/*
- * TODO: Add proper trace point for it, as btrfs_qgroup_free() is
- * called by everywhere, can't provide good trace for delayed ref case.
- */
 static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
                                                 u64 ref_root, u64 num_bytes)
 {
-       btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
        trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
+       btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
 }
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
index 1571bf26dc077a0575b39977ead28e229d68550f..d8ea0eb76325e9b25d42dfa4a99c63918981aa2f 100644 (file)
@@ -149,7 +149,7 @@ struct btrfs_raid_bio {
 
        int generic_bio_cnt;
 
-       atomic_t refs;
+       refcount_t refs;
 
        atomic_t stripes_pending;
 
@@ -389,7 +389,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
                if (bio_list_empty(&rbio->bio_list)) {
                        if (!list_empty(&rbio->hash_list)) {
                                list_del_init(&rbio->hash_list);
-                               atomic_dec(&rbio->refs);
+                               refcount_dec(&rbio->refs);
                                BUG_ON(!list_empty(&rbio->plug_list));
                        }
                }
@@ -480,7 +480,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
 
        /* bump our ref if we were not in the list before */
        if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
-               atomic_inc(&rbio->refs);
+               refcount_inc(&rbio->refs);
 
        if (!list_empty(&rbio->stripe_cache)){
                list_move(&rbio->stripe_cache, &table->stripe_cache);
@@ -689,7 +689,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
                            test_bit(RBIO_CACHE_BIT, &cur->flags) &&
                            !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
                                list_del_init(&cur->hash_list);
-                               atomic_dec(&cur->refs);
+                               refcount_dec(&cur->refs);
 
                                steal_rbio(cur, rbio);
                                cache_drop = cur;
@@ -738,7 +738,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
                }
        }
 lockit:
-       atomic_inc(&rbio->refs);
+       refcount_inc(&rbio->refs);
        list_add(&rbio->hash_list, &h->hash_list);
 out:
        spin_unlock_irqrestore(&h->lock, flags);
@@ -784,7 +784,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
                }
 
                list_del_init(&rbio->hash_list);
-               atomic_dec(&rbio->refs);
+               refcount_dec(&rbio->refs);
 
                /*
                 * we use the plug list to hold all the rbios
@@ -801,7 +801,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
                        list_del_init(&rbio->plug_list);
 
                        list_add(&next->hash_list, &h->hash_list);
-                       atomic_inc(&next->refs);
+                       refcount_inc(&next->refs);
                        spin_unlock(&rbio->bio_list_lock);
                        spin_unlock_irqrestore(&h->lock, flags);
 
@@ -843,8 +843,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 {
        int i;
 
-       WARN_ON(atomic_read(&rbio->refs) < 0);
-       if (!atomic_dec_and_test(&rbio->refs))
+       if (!refcount_dec_and_test(&rbio->refs))
                return;
 
        WARN_ON(!list_empty(&rbio->stripe_cache));
@@ -997,7 +996,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
        rbio->stripe_npages = stripe_npages;
        rbio->faila = -1;
        rbio->failb = -1;
-       atomic_set(&rbio->refs, 1);
+       refcount_set(&rbio->refs, 1);
        atomic_set(&rbio->error, 0);
        atomic_set(&rbio->stripes_pending, 0);
 
@@ -2118,6 +2117,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
        struct btrfs_raid_bio *rbio;
        int ret;
 
+       if (generic_io) {
+               ASSERT(bbio->mirror_num == mirror_num);
+               btrfs_io_bio(bio)->mirror_num = mirror_num;
+       }
+
        rbio = alloc_rbio(fs_info, bbio, stripe_len);
        if (IS_ERR(rbio)) {
                if (generic_io)
@@ -2194,6 +2198,8 @@ static void read_rebuild_work(struct btrfs_work *work)
 /*
  * The following code is used to scrub/replace the parity stripe
  *
+ * Caller must have already increased bio_counter for getting @bbio.
+ *
  * Note: We need make sure all the pages that add into the scrub/replace
  * raid bio are correct and not be changed during the scrub/replace. That
  * is those pages just hold metadata or file data with checksum.
@@ -2231,6 +2237,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
        ASSERT(rbio->stripe_npages == stripe_nsectors);
        bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
 
+       /*
+        * We have already increased bio_counter when getting bbio, record it
+        * so we can free it at rbio_orig_end_io().
+        */
+       rbio->generic_bio_cnt = 1;
+
        return rbio;
 }
 
@@ -2673,6 +2685,12 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
                return NULL;
        }
 
+       /*
+        * When we get bbio, we have already increased bio_counter, record it
+        * so we can free it at rbio_orig_end_io()
+        */
+       rbio->generic_bio_cnt = 1;
+
        return rbio;
 }
 
index e88bca87f5d275c7c25b2ee6279fe4cdc1482413..a17e775a4a89fca1e48214c5abb02756dd1c69d6 100644 (file)
@@ -209,9 +209,9 @@ cleanup:
        return;
 }
 
-int btree_readahead_hook(struct btrfs_fs_info *fs_info,
-                        struct extent_buffer *eb, int err)
+int btree_readahead_hook(struct extent_buffer *eb, int err)
 {
+       struct btrfs_fs_info *fs_info = eb->fs_info;
        int ret = 0;
        struct reada_extent *re;
 
@@ -235,10 +235,10 @@ start_machine:
        return ret;
 }
 
-static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
-                                         struct btrfs_device *dev, u64 logical,
+static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
                                          struct btrfs_bio *bbio)
 {
+       struct btrfs_fs_info *fs_info = dev->fs_info;
        int ret;
        struct reada_zone *zone;
        struct btrfs_block_group_cache *cache = NULL;
@@ -270,6 +270,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
        if (!zone)
                return NULL;
 
+       ret = radix_tree_preload(GFP_KERNEL);
+       if (ret) {
+               kfree(zone);
+               return NULL;
+       }
+
        zone->start = start;
        zone->end = end;
        INIT_LIST_HEAD(&zone->list);
@@ -299,6 +305,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
                        zone = NULL;
        }
        spin_unlock(&fs_info->reada_lock);
+       radix_tree_preload_end();
 
        return zone;
 }
@@ -313,7 +320,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
        struct btrfs_bio *bbio = NULL;
        struct btrfs_device *dev;
        struct btrfs_device *prev_dev;
-       u32 blocksize;
        u64 length;
        int real_stripes;
        int nzones = 0;
@@ -334,7 +340,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
        if (!re)
                return NULL;
 
-       blocksize = fs_info->nodesize;
        re->logical = logical;
        re->top = *top;
        INIT_LIST_HEAD(&re->extctl);
@@ -344,10 +349,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
        /*
         * map block
         */
-       length = blocksize;
+       length = fs_info->nodesize;
        ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
                        &length, &bbio, 0);
-       if (ret || !bbio || length < blocksize)
+       if (ret || !bbio || length < fs_info->nodesize)
                goto error;
 
        if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
@@ -367,7 +372,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
                 if (!dev->bdev)
                        continue;
 
-               zone = reada_find_zone(fs_info, dev, logical, bbio);
+               zone = reada_find_zone(dev, logical, bbio);
                if (!zone)
                        continue;
 
@@ -386,6 +391,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
                goto error;
        }
 
+       ret = radix_tree_preload(GFP_KERNEL);
+       if (ret)
+               goto error;
+
        /* insert extent in reada_tree + all per-device trees, all or nothing */
        btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
        spin_lock(&fs_info->reada_lock);
@@ -395,13 +404,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
                re_exist->refcnt++;
                spin_unlock(&fs_info->reada_lock);
                btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+               radix_tree_preload_end();
                goto error;
        }
        if (ret) {
                spin_unlock(&fs_info->reada_lock);
                btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+               radix_tree_preload_end();
                goto error;
        }
+       radix_tree_preload_end();
        prev_dev = NULL;
        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
                        &fs_info->dev_replace);
@@ -639,9 +651,9 @@ static int reada_pick_zone(struct btrfs_device *dev)
        return 1;
 }
 
-static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
-                                  struct btrfs_device *dev)
+static int reada_start_machine_dev(struct btrfs_device *dev)
 {
+       struct btrfs_fs_info *fs_info = dev->fs_info;
        struct reada_extent *re = NULL;
        int mirror_num = 0;
        struct extent_buffer *eb = NULL;
@@ -754,8 +766,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
                list_for_each_entry(device, &fs_devices->devices, dev_list) {
                        if (atomic_read(&device->reada_in_flight) <
                            MAX_IN_FLIGHT)
-                               enqueued += reada_start_machine_dev(fs_info,
-                                                                   device);
+                               enqueued += reada_start_machine_dev(device);
                }
                mutex_unlock(&fs_devices->device_list_mutex);
                total += enqueued;
index a08224eab8b47111b2fc6f7f51ac33e6ca69814e..7d6bc308bf4308f653cc300c1354602ddd0df911 100644 (file)
@@ -501,8 +501,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
 {
        struct btrfs_root_item *item = &root->root_item;
-       struct timespec ct = current_fs_time(root->fs_info->sb);
+       struct timespec ct;
 
+       ktime_get_real_ts(&ct);
        spin_lock(&root->root_item_lock);
        btrfs_set_root_ctransid(item, trans->transid);
        btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
index b0251eb1239fce83226650be88c31122a9f108af..c7b45eb2403d09e94b2538dabcb5a1f0116c55dd 100644 (file)
@@ -64,7 +64,7 @@ struct scrub_ctx;
 #define SCRUB_MAX_PAGES_PER_BLOCK      16      /* 64k per node/leaf/sector */
 
 struct scrub_recover {
-       atomic_t                refs;
+       refcount_t              refs;
        struct btrfs_bio        *bbio;
        u64                     map_length;
 };
@@ -112,7 +112,7 @@ struct scrub_block {
        struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
        int                     page_count;
        atomic_t                outstanding_pages;
-       atomic_t                refs; /* free mem on transition to zero */
+       refcount_t              refs; /* free mem on transition to zero */
        struct scrub_ctx        *sctx;
        struct scrub_parity     *sparity;
        struct {
@@ -140,9 +140,9 @@ struct scrub_parity {
 
        int                     nsectors;
 
-       int                     stripe_len;
+       u64                     stripe_len;
 
-       atomic_t                refs;
+       refcount_t              refs;
 
        struct list_head        spages;
 
@@ -202,7 +202,7 @@ struct scrub_ctx {
         * doesn't free the scrub context before or while the workers are
         * doing the wakeup() call.
         */
-       atomic_t                refs;
+       refcount_t              refs;
 };
 
 struct scrub_fixup_nodatasum {
@@ -240,6 +240,13 @@ struct scrub_warning {
        struct btrfs_device     *dev;
 };
 
+struct full_stripe_lock {
+       struct rb_node node;
+       u64 logical;
+       u64 refs;
+       struct mutex mutex;
+};
+
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -305,7 +312,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx);
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 {
-       atomic_inc(&sctx->refs);
+       refcount_inc(&sctx->refs);
        atomic_inc(&sctx->bios_in_flight);
 }
 
@@ -348,6 +355,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
        scrub_pause_off(fs_info);
 }
 
+/*
+ * Insert new full stripe lock into full stripe locks tree
+ *
+ * Return pointer to existing or newly inserted full_stripe_lock structure if
+ * everything works well.
+ * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
+ *
+ * NOTE: caller must hold full_stripe_locks_root->lock before calling this
+ * function
+ */
+static struct full_stripe_lock *insert_full_stripe_lock(
+               struct btrfs_full_stripe_locks_tree *locks_root,
+               u64 fstripe_logical)
+{
+       struct rb_node **p;
+       struct rb_node *parent = NULL;
+       struct full_stripe_lock *entry;
+       struct full_stripe_lock *ret;
+
+       WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+       p = &locks_root->root.rb_node;
+       while (*p) {
+               parent = *p;
+               entry = rb_entry(parent, struct full_stripe_lock, node);
+               if (fstripe_logical < entry->logical) {
+                       p = &(*p)->rb_left;
+               } else if (fstripe_logical > entry->logical) {
+                       p = &(*p)->rb_right;
+               } else {
+                       entry->refs++;
+                       return entry;
+               }
+       }
+
+       /* Insert new lock */
+       ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+       if (!ret)
+               return ERR_PTR(-ENOMEM);
+       ret->logical = fstripe_logical;
+       ret->refs = 1;
+       mutex_init(&ret->mutex);
+
+       rb_link_node(&ret->node, parent, p);
+       rb_insert_color(&ret->node, &locks_root->root);
+       return ret;
+}
+
+/*
+ * Search for a full stripe lock of a block group
+ *
+ * Return pointer to existing full stripe lock if found
+ * Return NULL if not found
+ */
+static struct full_stripe_lock *search_full_stripe_lock(
+               struct btrfs_full_stripe_locks_tree *locks_root,
+               u64 fstripe_logical)
+{
+       struct rb_node *node;
+       struct full_stripe_lock *entry;
+
+       WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+       node = locks_root->root.rb_node;
+       while (node) {
+               entry = rb_entry(node, struct full_stripe_lock, node);
+               if (fstripe_logical < entry->logical)
+                       node = node->rb_left;
+               else if (fstripe_logical > entry->logical)
+                       node = node->rb_right;
+               else
+                       return entry;
+       }
+       return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ *
+ * Caller must ensure @cache is a RAID56 block group.
+ */
+static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
+                                  u64 bytenr)
+{
+       u64 ret;
+
+       /*
+        * Due to chunk item size limit, full stripe length should not be
+        * larger than U32_MAX. Just a sanity check here.
+        */
+       WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
+
+       /*
+        * round_down() can only handle power of 2, while RAID56 full
+        * stripe length can be 64KiB * n, so we need to manually round down.
+        */
+       ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
+               cache->full_stripe_len + cache->key.objectid;
+       return ret;
+}
+
+/*
+ * Lock a full stripe to avoid concurrency of recovery and read
+ *
+ * It's only used for profiles with parities (RAID5/6); for other profiles it
+ * does nothing.
+ *
+ * Return 0 on success, with @locked_ret set to true if the full stripe
+ * covering @bytenr was locked (mutex held). In that case the caller must
+ * call unlock_full_stripe() in the same context.
+ * Return <0 if an error is encountered.
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+                           bool *locked_ret)
+{
+       struct btrfs_block_group_cache *bg_cache;
+       struct btrfs_full_stripe_locks_tree *locks_root;
+       struct full_stripe_lock *existing;
+       u64 fstripe_start;
+       int ret = 0;
+
+       *locked_ret = false;
+       bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+       if (!bg_cache) {
+               ASSERT(0);
+               return -ENOENT;
+       }
+
+       /* Profiles not based on parity don't need full stripe lock */
+       if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+               goto out;
+       locks_root = &bg_cache->full_stripe_locks_root;
+
+       fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+       /* Now insert the full stripe lock */
+       mutex_lock(&locks_root->lock);
+       existing = insert_full_stripe_lock(locks_root, fstripe_start);
+       mutex_unlock(&locks_root->lock);
+       if (IS_ERR(existing)) {
+               ret = PTR_ERR(existing);
+               goto out;
+       }
+       mutex_lock(&existing->mutex);
+       *locked_ret = true;
+out:
+       btrfs_put_block_group(bg_cache);
+       return ret;
+}
+
+/*
+ * Unlock a full stripe.
+ *
+ * NOTE: Caller must call this from the same context that called the
+ * corresponding lock_full_stripe().
+ *
+ * Return 0 if we unlock full stripe without problem.
+ * Return <0 for error
+ */
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+                             bool locked)
+{
+       struct btrfs_block_group_cache *bg_cache;
+       struct btrfs_full_stripe_locks_tree *locks_root;
+       struct full_stripe_lock *fstripe_lock;
+       u64 fstripe_start;
+       bool freeit = false;
+       int ret = 0;
+
+       /* If we didn't acquire full stripe lock, no need to continue */
+       if (!locked)
+               return 0;
+
+       bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+       if (!bg_cache) {
+               ASSERT(0);
+               return -ENOENT;
+       }
+       if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+               goto out;
+
+       locks_root = &bg_cache->full_stripe_locks_root;
+       fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+       mutex_lock(&locks_root->lock);
+       fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
+       /* Unpaired unlock_full_stripe() detected */
+       if (!fstripe_lock) {
+               WARN_ON(1);
+               ret = -ENOENT;
+               mutex_unlock(&locks_root->lock);
+               goto out;
+       }
+
+       if (fstripe_lock->refs == 0) {
+               WARN_ON(1);
+               btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
+                       fstripe_lock->logical);
+       } else {
+               fstripe_lock->refs--;
+       }
+
+       if (fstripe_lock->refs == 0) {
+               rb_erase(&fstripe_lock->node, &locks_root->root);
+               freeit = true;
+       }
+       mutex_unlock(&locks_root->lock);
+
+       mutex_unlock(&fstripe_lock->mutex);
+       if (freeit)
+               kfree(fstripe_lock);
+out:
+       btrfs_put_block_group(bg_cache);
+       return ret;
+}
+
 /*
  * used for workers that require transaction commits (i.e., for the
  * NOCOW case)
@@ -356,7 +579,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 {
        struct btrfs_fs_info *fs_info = sctx->fs_info;
 
-       atomic_inc(&sctx->refs);
+       refcount_inc(&sctx->refs);
        /*
         * increment scrubs_running to prevent cancel requests from
         * completing as long as a worker is running. we must also
@@ -447,7 +670,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 
 static void scrub_put_ctx(struct scrub_ctx *sctx)
 {
-       if (atomic_dec_and_test(&sctx->refs))
+       if (refcount_dec_and_test(&sctx->refs))
                scrub_free_ctx(sctx);
 }
 
@@ -462,7 +685,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
        sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
        if (!sctx)
                goto nomem;
-       atomic_set(&sctx->refs, 1);
+       refcount_set(&sctx->refs, 1);
        sctx->is_dev_replace = is_dev_replace;
        sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
        sctx->curr = -1;
@@ -857,12 +1080,14 @@ out:
 
 static inline void scrub_get_recover(struct scrub_recover *recover)
 {
-       atomic_inc(&recover->refs);
+       refcount_inc(&recover->refs);
 }
 
-static inline void scrub_put_recover(struct scrub_recover *recover)
+static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
+                                    struct scrub_recover *recover)
 {
-       if (atomic_dec_and_test(&recover->refs)) {
+       if (refcount_dec_and_test(&recover->refs)) {
+               btrfs_bio_counter_dec(fs_info);
                btrfs_put_bbio(recover->bbio);
                kfree(recover);
        }
@@ -892,6 +1117,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
        int mirror_index;
        int page_num;
        int success;
+       bool full_stripe_locked;
        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);
 
@@ -917,6 +1143,24 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
        have_csum = sblock_to_check->pagev[0]->have_csum;
        dev = sblock_to_check->pagev[0]->dev;
 
+       /*
+        * For RAID5/6, race can happen for a different device scrub thread.
+        * For data corruption, Parity and Data threads will both try
+        * to recover the data.
+        * Race can lead to doubly added csum error, or even unrecoverable
+        * error.
+        */
+       ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
+       if (ret < 0) {
+               spin_lock(&sctx->stat_lock);
+               if (ret == -ENOMEM)
+                       sctx->stat.malloc_errors++;
+               sctx->stat.read_errors++;
+               sctx->stat.uncorrectable_errors++;
+               spin_unlock(&sctx->stat_lock);
+               return ret;
+       }
+
        if (sctx->is_dev_replace && !is_metadata && !have_csum) {
                sblocks_for_recheck = NULL;
                goto nodatasum_case;
@@ -1241,7 +1485,7 @@ out:
                                sblock->pagev[page_index]->sblock = NULL;
                                recover = sblock->pagev[page_index]->recover;
                                if (recover) {
-                                       scrub_put_recover(recover);
+                                       scrub_put_recover(fs_info, recover);
                                        sblock->pagev[page_index]->recover =
                                                                        NULL;
                                }
@@ -1251,6 +1495,9 @@ out:
                kfree(sblocks_for_recheck);
        }
 
+       ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
+       if (ret < 0)
+               return ret;
        return 0;
 }
 
@@ -1330,20 +1577,23 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
                 * with a length of PAGE_SIZE, each returned stripe
                 * represents one mirror
                 */
+               btrfs_bio_counter_inc_blocked(fs_info);
                ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-                               logical, &mapped_length, &bbio, 0, 1);
+                               logical, &mapped_length, &bbio);
                if (ret || !bbio || mapped_length < sublen) {
                        btrfs_put_bbio(bbio);
+                       btrfs_bio_counter_dec(fs_info);
                        return -EIO;
                }
 
                recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
                if (!recover) {
                        btrfs_put_bbio(bbio);
+                       btrfs_bio_counter_dec(fs_info);
                        return -ENOMEM;
                }
 
-               atomic_set(&recover->refs, 1);
+               refcount_set(&recover->refs, 1);
                recover->bbio = bbio;
                recover->map_length = mapped_length;
 
@@ -1365,7 +1615,7 @@ leave_nomem:
                                spin_lock(&sctx->stat_lock);
                                sctx->stat.malloc_errors++;
                                spin_unlock(&sctx->stat_lock);
-                               scrub_put_recover(recover);
+                               scrub_put_recover(fs_info, recover);
                                return -ENOMEM;
                        }
                        scrub_page_get(page);
@@ -1407,7 +1657,7 @@ leave_nomem:
                        scrub_get_recover(recover);
                        page->recover = recover;
                }
-               scrub_put_recover(recover);
+               scrub_put_recover(fs_info, recover);
                length -= sublen;
                logical += sublen;
                page_index++;
@@ -1497,14 +1747,18 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
                bio_add_page(bio, page->page, PAGE_SIZE, 0);
                if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
-                       if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+                       if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
+                               page->io_error = 1;
                                sblock->no_io_error_seen = 0;
+                       }
                } else {
                        bio->bi_iter.bi_sector = page->physical >> 9;
                        bio_set_op_attrs(bio, REQ_OP_READ, 0);
 
-                       if (btrfsic_submit_bio_wait(bio))
+                       if (btrfsic_submit_bio_wait(bio)) {
+                               page->io_error = 1;
                                sblock->no_io_error_seen = 0;
+                       }
                }
 
                bio_put(bio);
@@ -1634,7 +1888,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
        if (spage->io_error) {
                void *mapped_buffer = kmap_atomic(spage->page);
 
-               memset(mapped_buffer, 0, PAGE_SIZE);
+               clear_page(mapped_buffer);
                flush_dcache_page(spage->page);
                kunmap_atomic(mapped_buffer);
        }
@@ -1998,12 +2252,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 
 static void scrub_block_get(struct scrub_block *sblock)
 {
-       atomic_inc(&sblock->refs);
+       refcount_inc(&sblock->refs);
 }
 
 static void scrub_block_put(struct scrub_block *sblock)
 {
-       if (atomic_dec_and_test(&sblock->refs)) {
+       if (refcount_dec_and_test(&sblock->refs)) {
                int i;
 
                if (sblock->sparity)
@@ -2187,8 +2441,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
        int ret;
        int i;
 
+       btrfs_bio_counter_inc_blocked(fs_info);
        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
-                       &length, &bbio, 0, 1);
+                       &length, &bbio);
        if (ret || !bbio || !bbio->raid_map)
                goto bbio_out;
 
@@ -2231,6 +2486,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
 rbio_out:
        bio_put(bio);
 bbio_out:
+       btrfs_bio_counter_dec(fs_info);
        btrfs_put_bbio(bbio);
        spin_lock(&sctx->stat_lock);
        sctx->stat.malloc_errors++;
@@ -2255,7 +2511,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 
        /* one ref inside this function, plus one for each page added to
         * a bio later on */
-       atomic_set(&sblock->refs, 1);
+       refcount_set(&sblock->refs, 1);
        sblock->sctx = sctx;
        sblock->no_io_error_seen = 1;
 
@@ -2385,7 +2641,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
                                       unsigned long *bitmap,
                                       u64 start, u64 len)
 {
-       u32 offset;
+       u64 offset;
        int nsectors;
        int sectorsize = sparity->sctx->fs_info->sectorsize;
 
@@ -2395,8 +2651,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
        }
 
        start -= sparity->logic_start;
-       start = div_u64_rem(start, sparity->stripe_len, &offset);
-       offset /= sectorsize;
+       start = div64_u64_rem(start, sparity->stripe_len, &offset);
+       offset = div_u64(offset, sectorsize);
        nsectors = (int)len / sectorsize;
 
        if (offset + nsectors <= sparity->nsectors) {
@@ -2555,7 +2811,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
 
        /* one ref inside this function, plus one for each page added to
         * a bio later on */
-       atomic_set(&sblock->refs, 1);
+       refcount_set(&sblock->refs, 1);
        sblock->sctx = sctx;
        sblock->no_io_error_seen = 1;
        sblock->sparity = sparity;
@@ -2694,7 +2950,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
        for (i = 0; i < nr_data_stripes(map); i++) {
                *offset = last_offset + i * map->stripe_len;
 
-               stripe_nr = div_u64(*offset, map->stripe_len);
+               stripe_nr = div64_u64(*offset, map->stripe_len);
                stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
 
                /* Work out the disk rotation on this stripe-set */
@@ -2765,7 +3021,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        struct bio *bio;
        struct btrfs_raid_bio *rbio;
-       struct scrub_page *spage;
        struct btrfs_bio *bbio = NULL;
        u64 length;
        int ret;
@@ -2775,8 +3030,10 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
                goto out;
 
        length = sparity->logic_end - sparity->logic_start;
+
+       btrfs_bio_counter_inc_blocked(fs_info);
        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
-                              &length, &bbio, 0, 1);
+                              &length, &bbio);
        if (ret || !bbio || !bbio->raid_map)
                goto bbio_out;
 
@@ -2795,9 +3052,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
        if (!rbio)
                goto rbio_out;
 
-       list_for_each_entry(spage, &sparity->spages, list)
-               raid56_add_scrub_pages(rbio, spage->page, spage->logical);
-
        scrub_pending_bio_inc(sctx);
        raid56_parity_submit_scrub_rbio(rbio);
        return;
@@ -2805,6 +3059,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 rbio_out:
        bio_put(bio);
 bbio_out:
+       btrfs_bio_counter_dec(fs_info);
        btrfs_put_bbio(bbio);
        bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
                  sparity->nsectors);
@@ -2822,12 +3077,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
 
 static void scrub_parity_get(struct scrub_parity *sparity)
 {
-       atomic_inc(&sparity->refs);
+       refcount_inc(&sparity->refs);
 }
 
 static void scrub_parity_put(struct scrub_parity *sparity)
 {
-       if (!atomic_dec_and_test(&sparity->refs))
+       if (!refcount_dec_and_test(&sparity->refs))
                return;
 
        scrub_parity_check_and_repair(sparity);
@@ -2879,7 +3134,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
        sparity->scrub_dev = sdev;
        sparity->logic_start = logic_start;
        sparity->logic_end = logic_end;
-       atomic_set(&sparity->refs, 1);
+       refcount_set(&sparity->refs, 1);
        INIT_LIST_HEAD(&sparity->spages);
        sparity->dbitmap = sparity->bitmap;
        sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3098,7 +3353,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 
        physical = map->stripes[num].physical;
        offset = 0;
-       nstripes = div_u64(length, map->stripe_len);
+       nstripes = div64_u64(length, map->stripe_len);
        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
                offset = map->stripe_len * num;
                increment = map->stripe_len * map->num_stripes;
index 3f645cd67b540ac3b066a2727bd012c87895247d..fc496a6f842a87d3544e80b3b9e063d57417f788 100644 (file)
@@ -5184,13 +5184,19 @@ static int is_extent_unchanged(struct send_ctx *sctx,
        while (key.offset < ekey->offset + left_len) {
                ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
                right_type = btrfs_file_extent_type(eb, ei);
-               if (right_type != BTRFS_FILE_EXTENT_REG) {
+               if (right_type != BTRFS_FILE_EXTENT_REG &&
+                   right_type != BTRFS_FILE_EXTENT_INLINE) {
                        ret = 0;
                        goto out;
                }
 
                right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
-               right_len = btrfs_file_extent_num_bytes(eb, ei);
+               if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+                       right_len = btrfs_file_extent_inline_len(eb, slot, ei);
+                       right_len = PAGE_ALIGN(right_len);
+               } else {
+                       right_len = btrfs_file_extent_num_bytes(eb, ei);
+               }
                right_offset = btrfs_file_extent_offset(eb, ei);
                right_gen = btrfs_file_extent_generation(eb, ei);
 
@@ -5204,6 +5210,19 @@ static int is_extent_unchanged(struct send_ctx *sctx,
                        goto out;
                }
 
+               /*
+                * For an inline extent, we only wanted to run the check above,
+                * i.e. to see whether what follows it is a regular extent.
+                * This should normally not happen, but it's possible, for
+                * example, when we have an inline compressed extent
+                * representing data with a size matching the page size
+                * (currently the same as sector size).
+                */
+               if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+                       ret = 0;
+                       goto out;
+               }
+
                left_offset_fixed = left_offset;
                if (key.offset < ekey->offset) {
                        /* Fix the right offset for 2a and 7. */
index 72a053c9a7f097bfc9086ed6e0035e62dfd76bb1..4f1cdd5058f12b9970b5894828498624a28c77b5 100644 (file)
@@ -1795,8 +1795,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                }
 
                if (fs_info->fs_devices->missing_devices >
-                    fs_info->num_tolerated_disk_barrier_failures &&
-                   !(*flags & MS_RDONLY)) {
+                    fs_info->num_tolerated_disk_barrier_failures) {
                        btrfs_warn(fs_info,
                                "too many missing devices, writeable remount is not allowed");
                        ret = -EACCES;
index ea272432c9305177a1059db321f8aae6f89a09f3..b18ab8f327a53dd10b09bee6169f34ecff5eb809 100644 (file)
@@ -237,7 +237,6 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
 {
        memset(trans, 0, sizeof(*trans));
        trans->transid = 1;
-       INIT_LIST_HEAD(&trans->qgroup_ref_list);
        trans->type = __TRANS_DUMMY;
 }
 
index 61b807de3e164e38877230cdfd1e9a545573f1be..2168654c90a1e6cab355d8270b23db68de0253c3 100644 (file)
@@ -60,8 +60,8 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
 
 void btrfs_put_transaction(struct btrfs_transaction *transaction)
 {
-       WARN_ON(atomic_read(&transaction->use_count) == 0);
-       if (atomic_dec_and_test(&transaction->use_count)) {
+       WARN_ON(refcount_read(&transaction->use_count) == 0);
+       if (refcount_dec_and_test(&transaction->use_count)) {
                BUG_ON(!list_empty(&transaction->list));
                WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
                if (transaction->delayed_refs.pending_csums)
@@ -207,7 +207,7 @@ loop:
                        spin_unlock(&fs_info->trans_lock);
                        return -EBUSY;
                }
-               atomic_inc(&cur_trans->use_count);
+               refcount_inc(&cur_trans->use_count);
                atomic_inc(&cur_trans->num_writers);
                extwriter_counter_inc(cur_trans, type);
                spin_unlock(&fs_info->trans_lock);
@@ -257,7 +257,7 @@ loop:
         * One for this trans handle, one so it will live on until we
         * commit the transaction.
         */
-       atomic_set(&cur_trans->use_count, 2);
+       refcount_set(&cur_trans->use_count, 2);
        atomic_set(&cur_trans->pending_ordered, 0);
        cur_trans->flags = 0;
        cur_trans->start_time = get_seconds();
@@ -432,7 +432,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
        spin_lock(&fs_info->trans_lock);
        cur_trans = fs_info->running_transaction;
        if (cur_trans && is_transaction_blocked(cur_trans)) {
-               atomic_inc(&cur_trans->use_count);
+               refcount_inc(&cur_trans->use_count);
                spin_unlock(&fs_info->trans_lock);
 
                wait_event(fs_info->transaction_wait,
@@ -572,7 +572,6 @@ again:
 
        h->type = type;
        h->can_flush_pending_bgs = true;
-       INIT_LIST_HEAD(&h->qgroup_ref_list);
        INIT_LIST_HEAD(&h->new_bgs);
 
        smp_mb();
@@ -744,7 +743,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
                list_for_each_entry(t, &fs_info->trans_list, list) {
                        if (t->transid == transid) {
                                cur_trans = t;
-                               atomic_inc(&cur_trans->use_count);
+                               refcount_inc(&cur_trans->use_count);
                                ret = 0;
                                break;
                        }
@@ -773,7 +772,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
                                if (t->state == TRANS_STATE_COMPLETED)
                                        break;
                                cur_trans = t;
-                               atomic_inc(&cur_trans->use_count);
+                               refcount_inc(&cur_trans->use_count);
                                break;
                        }
                }
@@ -917,7 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                wake_up_process(info->transaction_kthread);
                err = -EIO;
        }
-       assert_qgroups_uptodate(trans);
 
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
        if (must_run_delayed_refs) {
@@ -1839,7 +1837,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 
        /* take transaction reference */
        cur_trans = trans->transaction;
-       atomic_inc(&cur_trans->use_count);
+       refcount_inc(&cur_trans->use_count);
 
        btrfs_end_transaction(trans);
 
@@ -2015,7 +2013,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
        spin_lock(&fs_info->trans_lock);
        if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
                spin_unlock(&fs_info->trans_lock);
-               atomic_inc(&cur_trans->use_count);
+               refcount_inc(&cur_trans->use_count);
                ret = btrfs_end_transaction(trans);
 
                wait_for_commit(cur_trans);
@@ -2035,7 +2033,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
                if (prev_trans->state != TRANS_STATE_COMPLETED) {
-                       atomic_inc(&prev_trans->use_count);
+                       refcount_inc(&prev_trans->use_count);
                        spin_unlock(&fs_info->trans_lock);
 
                        wait_for_commit(prev_trans);
@@ -2130,13 +2128,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
                goto scrub_continue;
        }
 
-       /* Reocrd old roots for later qgroup accounting */
-       ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
-       if (ret) {
-               mutex_unlock(&fs_info->reloc_mutex);
-               goto scrub_continue;
-       }
-
        /*
         * make sure none of the code above managed to slip in a
         * delayed item
@@ -2178,6 +2169,24 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
         */
        btrfs_free_log_root_tree(trans, fs_info);
 
+       /*
+        * commit_fs_roots() can call btrfs_save_ino_cache(), which generates
+        * new delayed refs. Must handle them or qgroup can be wrong.
+        */
+       ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+       if (ret) {
+               mutex_unlock(&fs_info->tree_log_mutex);
+               mutex_unlock(&fs_info->reloc_mutex);
+               goto scrub_continue;
+       }
+
+       ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+       if (ret) {
+               mutex_unlock(&fs_info->tree_log_mutex);
+               mutex_unlock(&fs_info->reloc_mutex);
+               goto scrub_continue;
+       }
+
        /*
         * Since fs roots are all committed, we can get a quite accurate
         * new_roots. So let's do quota accounting.
@@ -2223,7 +2232,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
        switch_commit_roots(cur_trans, fs_info);
 
-       assert_qgroups_uptodate(trans);
        ASSERT(list_empty(&cur_trans->dirty_bgs));
        ASSERT(list_empty(&cur_trans->io_bgs));
        update_super_roots(fs_info);
index 5dfb5590fff654a077fd95704682293bfc948a8f..c55e44560103b48e2a917701899c611732d8a013 100644 (file)
@@ -18,6 +18,8 @@
 
 #ifndef __BTRFS_TRANSACTION__
 #define __BTRFS_TRANSACTION__
+
+#include <linux/refcount.h>
 #include "btrfs_inode.h"
 #include "delayed-ref.h"
 #include "ctree.h"
@@ -49,7 +51,7 @@ struct btrfs_transaction {
         * transaction can end
         */
        atomic_t num_writers;
-       atomic_t use_count;
+       refcount_t use_count;
        atomic_t pending_ordered;
 
        unsigned long flags;
@@ -125,8 +127,6 @@ struct btrfs_trans_handle {
        unsigned int type;
        struct btrfs_root *root;
        struct btrfs_fs_info *fs_info;
-       struct seq_list delayed_ref_elem;
-       struct list_head qgroup_ref_list;
        struct list_head new_bgs;
 };
 
index a59674c3e69efb76d27d6705b41ca76d94e82e15..ccfe9fe7754a8d4d80fd3e5b1f0a1d2f2118e4e6 100644 (file)
@@ -4196,7 +4196,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
                if (em->generation <= test_gen)
                        continue;
                /* Need a ref to keep it from getting evicted from cache */
-               atomic_inc(&em->refs);
+               refcount_inc(&em->refs);
                set_bit(EXTENT_FLAG_LOGGING, &em->flags);
                list_add_tail(&em->list, &extents);
                num++;
index ab8a66d852f91cb04206361a551b8c57760c9c40..017b67daa3bbf375919019e5c089b94f97d3e2a5 100644 (file)
@@ -139,6 +139,11 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
+                            enum btrfs_map_op op,
+                            u64 logical, u64 *length,
+                            struct btrfs_bio **bbio_ret,
+                            int mirror_num, int need_raid_map);
 
 DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
@@ -1008,14 +1013,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                q = bdev_get_queue(bdev);
                if (blk_queue_discard(q))
                        device->can_discard = 1;
+               if (!blk_queue_nonrot(q))
+                       fs_devices->rotating = 1;
 
                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;
 
-               if (!blk_queue_nonrot(bdev_get_queue(bdev)))
-                       fs_devices->rotating = 1;
-
                fs_devices->open_devices++;
                if (device->writeable &&
                    device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -2417,7 +2421,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
        fs_info->free_chunk_space += device->total_bytes;
        spin_unlock(&fs_info->free_chunk_lock);
 
-       if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+       if (!blk_queue_nonrot(q))
                fs_info->fs_devices->rotating = 1;
 
        tmp = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2795,10 +2799,38 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info,
        return ret;
 }
 
+static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
+                                       u64 logical, u64 length)
+{
+       struct extent_map_tree *em_tree;
+       struct extent_map *em;
+
+       em_tree = &fs_info->mapping_tree.map_tree;
+       read_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, logical, length);
+       read_unlock(&em_tree->lock);
+
+       if (!em) {
+               btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+                          logical, length);
+               return ERR_PTR(-EINVAL);
+       }
+
+       if (em->start > logical || em->start + em->len < logical) {
+               btrfs_crit(fs_info,
+                          "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
+                          logical, length, em->start, em->start + em->len);
+               free_extent_map(em);
+               return ERR_PTR(-EINVAL);
+       }
+
+       /* callers are responsible for dropping em's ref. */
+       return em;
+}
+
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
                       struct btrfs_fs_info *fs_info, u64 chunk_offset)
 {
-       struct extent_map_tree *em_tree;
        struct extent_map *em;
        struct map_lookup *map;
        u64 dev_extent_len = 0;
@@ -2806,23 +2838,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
        int i, ret = 0;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 
-       em_tree = &fs_info->mapping_tree.map_tree;
-
-       read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, chunk_offset, 1);
-       read_unlock(&em_tree->lock);
-
-       if (!em || em->start > chunk_offset ||
-           em->start + em->len < chunk_offset) {
+       em = get_chunk_map(fs_info, chunk_offset, 1);
+       if (IS_ERR(em)) {
                /*
                 * This is a logic error, but we don't want to just rely on the
                 * user having built with ASSERT enabled, so if ASSERT doesn't
                 * do anything we still error out.
                 */
                ASSERT(0);
-               if (em)
-                       free_extent_map(em);
-               return -EINVAL;
+               return PTR_ERR(em);
        }
        map = em->map_lookup;
        mutex_lock(&fs_info->chunk_mutex);
@@ -3736,7 +3760,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
        if (ret)
                btrfs_handle_fs_error(fs_info, ret, NULL);
 
-       atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 }
 
 /* Non-zero return value signifies invalidity */
@@ -3755,6 +3779,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                  struct btrfs_ioctl_balance_args *bargs)
 {
        struct btrfs_fs_info *fs_info = bctl->fs_info;
+       u64 meta_target, data_target;
        u64 allowed;
        int mixed = 0;
        int ret;
@@ -3851,11 +3876,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                }
        } while (read_seqretry(&fs_info->profiles_lock, seq));
 
-       if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <
-               btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {
+       /* if we're not converting, the target field is uninitialized */
+       meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+               bctl->meta.target : fs_info->avail_metadata_alloc_bits;
+       data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+               bctl->data.target : fs_info->avail_data_alloc_bits;
+       if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
+               btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
                btrfs_warn(fs_info,
                           "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
-                          bctl->meta.target, bctl->data.target);
+                          meta_target, data_target);
        }
 
        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
@@ -3910,7 +3940,7 @@ out:
                __cancel_balance(fs_info);
        else {
                kfree(bctl);
-               atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+               clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
        }
        return ret;
 }
@@ -4000,7 +4030,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
        btrfs_balance_sys(leaf, item, &disk_bargs);
        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
 
-       WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+       WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
 
        mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&fs_info->balance_mutex);
@@ -4785,7 +4815,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        stripe_size = div_u64(stripe_size, dev_stripes);
 
        /* align to BTRFS_STRIPE_LEN */
-       stripe_size = div_u64(stripe_size, raid_stripe_len);
+       stripe_size = div64_u64(stripe_size, raid_stripe_len);
        stripe_size *= raid_stripe_len;
 
        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -4833,7 +4863,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        ret = add_extent_mapping(em_tree, em, 0);
        if (!ret) {
                list_add_tail(&em->list, &trans->transaction->pending_chunks);
-               atomic_inc(&em->refs);
+               refcount_inc(&em->refs);
        }
        write_unlock(&em_tree->lock);
        if (ret) {
@@ -4888,7 +4918,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
        struct btrfs_device *device;
        struct btrfs_chunk *chunk;
        struct btrfs_stripe *stripe;
-       struct extent_map_tree *em_tree;
        struct extent_map *em;
        struct map_lookup *map;
        size_t item_size;
@@ -4897,24 +4926,9 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
        int i = 0;
        int ret = 0;
 
-       em_tree = &fs_info->mapping_tree.map_tree;
-       read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
-       read_unlock(&em_tree->lock);
-
-       if (!em) {
-               btrfs_crit(fs_info, "unable to find logical %Lu len %Lu",
-                          chunk_offset, chunk_size);
-               return -EINVAL;
-       }
-
-       if (em->start != chunk_offset || em->len != chunk_size) {
-               btrfs_crit(fs_info,
-                          "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu",
-                           chunk_offset, chunk_size, em->start, em->len);
-               free_extent_map(em);
-               return -EINVAL;
-       }
+       em = get_chunk_map(fs_info, chunk_offset, chunk_size);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
 
        map = em->map_lookup;
        item_size = btrfs_chunk_item_size(map->num_stripes);
@@ -5055,15 +5069,12 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 {
        struct extent_map *em;
        struct map_lookup *map;
-       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
        int readonly = 0;
        int miss_ndevs = 0;
        int i;
 
-       read_lock(&map_tree->map_tree.lock);
-       em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
-       read_unlock(&map_tree->map_tree.lock);
-       if (!em)
+       em = get_chunk_map(fs_info, chunk_offset, 1);
+       if (IS_ERR(em))
                return 1;
 
        map = em->map_lookup;
@@ -5117,34 +5128,19 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 
 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 {
-       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        struct map_lookup *map;
-       struct extent_map_tree *em_tree = &map_tree->map_tree;
        int ret;
 
-       read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, logical, len);
-       read_unlock(&em_tree->lock);
-
-       /*
-        * We could return errors for these cases, but that could get ugly and
-        * we'd probably do the same thing which is just not do anything else
-        * and exit, so return 1 so the callers don't try to use other copies.
-        */
-       if (!em) {
-               btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
-                           logical+len);
-               return 1;
-       }
-
-       if (em->start > logical || em->start + em->len < logical) {
-               btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got %Lu-%Lu",
-                          logical, logical+len, em->start,
-                          em->start + em->len);
-               free_extent_map(em);
+       em = get_chunk_map(fs_info, logical, len);
+       if (IS_ERR(em))
+               /*
+                * We could return errors for these cases, but that could get
+                * ugly and we'd probably do the same thing which is just not do
+                * anything else and exit, so return 1 so the callers don't try
+                * to use other copies.
+                */
                return 1;
-       }
 
        map = em->map_lookup;
        if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
@@ -5160,7 +5156,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
        free_extent_map(em);
 
        btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
-       if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+       if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
+           fs_info->dev_replace.tgtdev)
                ret++;
        btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
 
@@ -5173,15 +5170,11 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 {
        struct extent_map *em;
        struct map_lookup *map;
-       struct extent_map_tree *em_tree = &map_tree->map_tree;
        unsigned long len = fs_info->sectorsize;
 
-       read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, logical, len);
-       read_unlock(&em_tree->lock);
-       BUG_ON(!em);
+       em = get_chunk_map(fs_info, logical, len);
+       WARN_ON(IS_ERR(em));
 
-       BUG_ON(em->start > logical || em->start + em->len < logical);
        map = em->map_lookup;
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                len = map->stripe_len * nr_data_stripes(map);
@@ -5189,20 +5182,16 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
        return len;
 }
 
-int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
                           u64 logical, u64 len, int mirror_num)
 {
        struct extent_map *em;
        struct map_lookup *map;
-       struct extent_map_tree *em_tree = &map_tree->map_tree;
        int ret = 0;
 
-       read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, logical, len);
-       read_unlock(&em_tree->lock);
-       BUG_ON(!em);
+       em = get_chunk_map(fs_info, logical, len);
+       WARN_ON(IS_ERR(em));
 
-       BUG_ON(em->start > logical || em->start + em->len < logical);
        map = em->map_lookup;
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                ret = 1;
@@ -5295,25 +5284,353 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
                GFP_NOFS|__GFP_NOFAIL);
 
        atomic_set(&bbio->error, 0);
-       atomic_set(&bbio->refs, 1);
+       refcount_set(&bbio->refs, 1);
 
        return bbio;
 }
 
 void btrfs_get_bbio(struct btrfs_bio *bbio)
 {
-       WARN_ON(!atomic_read(&bbio->refs));
-       atomic_inc(&bbio->refs);
+       WARN_ON(!refcount_read(&bbio->refs));
+       refcount_inc(&bbio->refs);
 }
 
 void btrfs_put_bbio(struct btrfs_bio *bbio)
 {
        if (!bbio)
                return;
-       if (atomic_dec_and_test(&bbio->refs))
+       if (refcount_dec_and_test(&bbio->refs))
                kfree(bbio);
 }
 
+/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
+/*
+ * Please note that discard won't be sent to the target device of a device
+ * replace.
+ */
+static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
+                                        u64 logical, u64 length,
+                                        struct btrfs_bio **bbio_ret)
+{
+       struct extent_map *em;
+       struct map_lookup *map;
+       struct btrfs_bio *bbio;
+       u64 offset;
+       u64 stripe_nr;
+       u64 stripe_nr_end;
+       u64 stripe_end_offset;
+       u64 stripe_cnt;
+       u64 stripe_len;
+       u64 stripe_offset;
+       u64 num_stripes;
+       u32 stripe_index;
+       u32 factor = 0;
+       u32 sub_stripes = 0;
+       u64 stripes_per_dev = 0;
+       u32 remaining_stripes = 0;
+       u32 last_stripe = 0;
+       int ret = 0;
+       int i;
+
+       /* discard always returns a bbio */
+       ASSERT(bbio_ret);
+
+       em = get_chunk_map(fs_info, logical, length);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
+
+       map = em->map_lookup;
+       /* we don't discard raid56 yet */
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+               ret = -EOPNOTSUPP;
+               goto out;
+       }
+
+       offset = logical - em->start;
+       length = min_t(u64, em->len - offset, length);
+
+       stripe_len = map->stripe_len;
+       /*
+        * stripe_nr counts the total number of stripes we have to stride
+        * to get to this block
+        */
+       stripe_nr = div64_u64(offset, stripe_len);
+
+       /* stripe_offset is the offset of this block in its stripe */
+       stripe_offset = offset - stripe_nr * stripe_len;
+
+       stripe_nr_end = round_up(offset + length, map->stripe_len);
+       stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
+       stripe_cnt = stripe_nr_end - stripe_nr;
+       stripe_end_offset = stripe_nr_end * map->stripe_len -
+                           (offset + length);
+       /*
+        * after this, stripe_nr is the number of stripes on this
+        * device we have to walk to find the data, and stripe_index is
+        * the number of our device in the stripe array
+        */
+       num_stripes = 1;
+       stripe_index = 0;
+       if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+                        BTRFS_BLOCK_GROUP_RAID10)) {
+               if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+                       sub_stripes = 1;
+               else
+                       sub_stripes = map->sub_stripes;
+
+               factor = map->num_stripes / sub_stripes;
+               num_stripes = min_t(u64, map->num_stripes,
+                                   sub_stripes * stripe_cnt);
+               stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+               stripe_index *= sub_stripes;
+               stripes_per_dev = div_u64_rem(stripe_cnt, factor,
+                                             &remaining_stripes);
+               div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
+               last_stripe *= sub_stripes;
+       } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+                               BTRFS_BLOCK_GROUP_DUP)) {
+               num_stripes = map->num_stripes;
+       } else {
+               stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+                                       &stripe_index);
+       }
+
+       bbio = alloc_btrfs_bio(num_stripes, 0);
+       if (!bbio) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < num_stripes; i++) {
+               bbio->stripes[i].physical =
+                       map->stripes[stripe_index].physical +
+                       stripe_offset + stripe_nr * map->stripe_len;
+               bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+
+               if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+                                BTRFS_BLOCK_GROUP_RAID10)) {
+                       bbio->stripes[i].length = stripes_per_dev *
+                               map->stripe_len;
+
+                       if (i / sub_stripes < remaining_stripes)
+                               bbio->stripes[i].length +=
+                                       map->stripe_len;
+
+                       /*
+                        * Special for the first stripe and
+                        * the last stripe:
+                        *
+                        * |-------|...|-------|
+                        *     |----------|
+                        *    off     end_off
+                        */
+                       if (i < sub_stripes)
+                               bbio->stripes[i].length -=
+                                       stripe_offset;
+
+                       if (stripe_index >= last_stripe &&
+                           stripe_index <= (last_stripe +
+                                            sub_stripes - 1))
+                               bbio->stripes[i].length -=
+                                       stripe_end_offset;
+
+                       if (i == sub_stripes - 1)
+                               stripe_offset = 0;
+               } else {
+                       bbio->stripes[i].length = length;
+               }
+
+               stripe_index++;
+               if (stripe_index == map->num_stripes) {
+                       stripe_index = 0;
+                       stripe_nr++;
+               }
+       }
+
+       *bbio_ret = bbio;
+       bbio->map_type = map->type;
+       bbio->num_stripes = num_stripes;
+out:
+       free_extent_map(em);
+       return ret;
+}
+
+/*
+ * In dev-replace case, for repair case (that's the only case where the mirror
+ * is selected explicitly when calling btrfs_map_block), blocks left of the
+ * left cursor can also be read from the target drive.
+ *
+ * For BTRFS_MAP_GET_READ_MIRRORS, the target drive is added as the last one to
+ * the array of stripes.
+ * For READ, it also needs to be supported using the same mirror number.
+ *
+ * If the requested block is not left of the left cursor, EIO is returned. This
+ * can happen because btrfs_num_copies() returns one more in the dev-replace
+ * case.
+ */
+static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
+                                        u64 logical, u64 length,
+                                        u64 srcdev_devid, int *mirror_num,
+                                        u64 *physical)
+{
+       struct btrfs_bio *bbio = NULL;
+       int num_stripes;
+       int index_srcdev = 0;
+       int found = 0;
+       u64 physical_of_found = 0;
+       int i;
+       int ret = 0;
+
+       ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+                               logical, &length, &bbio, 0, 0);
+       if (ret) {
+               ASSERT(bbio == NULL);
+               return ret;
+       }
+
+       num_stripes = bbio->num_stripes;
+       if (*mirror_num > num_stripes) {
+               /*
+                * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
+                * which means that the requested area is not left of the left
+                * cursor.
+                */
+               btrfs_put_bbio(bbio);
+               return -EIO;
+       }
+
+       /*
+        * The caller processes the rest of the mapping using the mirror_num of
+        * the source drive. Therefore look it up first.  At the end, the caller
+        * patches the device pointer to the one of the target drive.
+        */
+       for (i = 0; i < num_stripes; i++) {
+               if (bbio->stripes[i].dev->devid != srcdev_devid)
+                       continue;
+
+               /*
+                * In case of DUP, in order to keep it simple, only add the
+                * mirror with the lowest physical address
+                */
+               if (found &&
+                   physical_of_found <= bbio->stripes[i].physical)
+                       continue;
+
+               index_srcdev = i;
+               found = 1;
+               physical_of_found = bbio->stripes[i].physical;
+       }
+
+       btrfs_put_bbio(bbio);
+
+       ASSERT(found);
+       if (!found)
+               return -EIO;
+
+       *mirror_num = index_srcdev + 1;
+       *physical = physical_of_found;
+       return ret;
+}
+
+static void handle_ops_on_dev_replace(enum btrfs_map_op op,
+                                     struct btrfs_bio **bbio_ret,
+                                     struct btrfs_dev_replace *dev_replace,
+                                     int *num_stripes_ret, int *max_errors_ret)
+{
+       struct btrfs_bio *bbio = *bbio_ret;
+       u64 srcdev_devid = dev_replace->srcdev->devid;
+       int tgtdev_indexes = 0;
+       int num_stripes = *num_stripes_ret;
+       int max_errors = *max_errors_ret;
+       int i;
+
+       if (op == BTRFS_MAP_WRITE) {
+               int index_where_to_add;
+
+               /*
+                * duplicate the write operations while the dev replace
+                * procedure is running. Since the copying of the old disk to
+                * the new disk takes place at run time while the filesystem is
+                * mounted writable, the regular write operations to the old
+                * disk have to be duplicated to go to the new disk as well.
+                *
+                * Note that device->missing is handled by the caller, and that
+                * the write to the old disk is already set up in the stripes
+                * array.
+                */
+               index_where_to_add = num_stripes;
+               for (i = 0; i < num_stripes; i++) {
+                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                               /* write to new disk, too */
+                               struct btrfs_bio_stripe *new =
+                                       bbio->stripes + index_where_to_add;
+                               struct btrfs_bio_stripe *old =
+                                       bbio->stripes + i;
+
+                               new->physical = old->physical;
+                               new->length = old->length;
+                               new->dev = dev_replace->tgtdev;
+                               bbio->tgtdev_map[i] = index_where_to_add;
+                               index_where_to_add++;
+                               max_errors++;
+                               tgtdev_indexes++;
+                       }
+               }
+               num_stripes = index_where_to_add;
+       } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
+               int index_srcdev = 0;
+               int found = 0;
+               u64 physical_of_found = 0;
+
+               /*
+                * During the dev-replace procedure, the target drive can also
+                * be used to read data in case it is needed to repair a corrupt
+                * block elsewhere. This is possible if the requested area is
+                * left of the left cursor. In this area, the target drive is a
+                * full copy of the source drive.
+                */
+               for (i = 0; i < num_stripes; i++) {
+                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                               /*
+                                * In case of DUP, in order to keep it simple,
+                                * only add the mirror with the lowest physical
+                                * address
+                                */
+                               if (found &&
+                                   physical_of_found <=
+                                    bbio->stripes[i].physical)
+                                       continue;
+                               index_srcdev = i;
+                               found = 1;
+                               physical_of_found = bbio->stripes[i].physical;
+                       }
+               }
+               if (found) {
+                       struct btrfs_bio_stripe *tgtdev_stripe =
+                               bbio->stripes + num_stripes;
+
+                       tgtdev_stripe->physical = physical_of_found;
+                       tgtdev_stripe->length =
+                               bbio->stripes[index_srcdev].length;
+                       tgtdev_stripe->dev = dev_replace->tgtdev;
+                       bbio->tgtdev_map[index_srcdev] = num_stripes;
+
+                       tgtdev_indexes++;
+                       num_stripes++;
+               }
+       }
+
+       *num_stripes_ret = num_stripes;
+       *max_errors_ret = max_errors;
+       bbio->num_tgtdevs = tgtdev_indexes;
+       *bbio_ret = bbio;
+}
+
+static bool need_full_stripe(enum btrfs_map_op op)
+{
+       return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
+}
+
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
@@ -5322,14 +5639,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 {
        struct extent_map *em;
        struct map_lookup *map;
-       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-       struct extent_map_tree *em_tree = &map_tree->map_tree;
        u64 offset;
        u64 stripe_offset;
-       u64 stripe_end_offset;
        u64 stripe_nr;
-       u64 stripe_nr_orig;
-       u64 stripe_nr_end;
        u64 stripe_len;
        u32 stripe_index;
        int i;
@@ -5345,23 +5657,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
        u64 physical_to_patch_in_first_stripe = 0;
        u64 raid56_full_stripe_start = (u64)-1;
 
-       read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, logical, *length);
-       read_unlock(&em_tree->lock);
-
-       if (!em) {
-               btrfs_crit(fs_info, "unable to find logical %llu len %llu",
-                       logical, *length);
-               return -EINVAL;
-       }
+       if (op == BTRFS_MAP_DISCARD)
+               return __btrfs_map_block_for_discard(fs_info, logical,
+                                                    *length, bbio_ret);
 
-       if (em->start > logical || em->start + em->len < logical) {
-               btrfs_crit(fs_info,
-                          "found a bad mapping, wanted %Lu, found %Lu-%Lu",
-                          logical, em->start, em->start + em->len);
-               free_extent_map(em);
-               return -EINVAL;
-       }
+       em = get_chunk_map(fs_info, logical, *length);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
 
        map = em->map_lookup;
        offset = logical - em->start;
@@ -5400,14 +5702,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                raid56_full_stripe_start *= full_stripe_len;
        }
 
-       if (op == BTRFS_MAP_DISCARD) {
-               /* we don't discard raid56 yet */
-               if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-                       ret = -EOPNOTSUPP;
-                       goto out;
-               }
-               *length = min_t(u64, em->len - offset, *length);
-       } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+       if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
                u64 max_len;
                /* For writes to RAID[56], allow a full stripeset across all disks.
                   For other RAID types and for RAID[56] reads, just allow a single
@@ -5438,105 +5733,28 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                btrfs_dev_replace_set_lock_blocking(dev_replace);
 
        if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
-           op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
-           op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) {
-               /*
-                * in dev-replace case, for repair case (that's the only
-                * case where the mirror is selected explicitly when
-                * calling btrfs_map_block), blocks left of the left cursor
-                * can also be read from the target drive.
-                * For REQ_GET_READ_MIRRORS, the target drive is added as
-                * the last one to the array of stripes. For READ, it also
-                * needs to be supported using the same mirror number.
-                * If the requested block is not left of the left cursor,
-                * EIO is returned. This can happen because btrfs_num_copies()
-                * returns one more in the dev-replace case.
-                */
-               u64 tmp_length = *length;
-               struct btrfs_bio *tmp_bbio = NULL;
-               int tmp_num_stripes;
-               u64 srcdev_devid = dev_replace->srcdev->devid;
-               int index_srcdev = 0;
-               int found = 0;
-               u64 physical_of_found = 0;
-
-               ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-                            logical, &tmp_length, &tmp_bbio, 0, 0);
-               if (ret) {
-                       WARN_ON(tmp_bbio != NULL);
-                       goto out;
-               }
-
-               tmp_num_stripes = tmp_bbio->num_stripes;
-               if (mirror_num > tmp_num_stripes) {
-                       /*
-                        * BTRFS_MAP_GET_READ_MIRRORS does not contain this
-                        * mirror, that means that the requested area
-                        * is not left of the left cursor
-                        */
-                       ret = -EIO;
-                       btrfs_put_bbio(tmp_bbio);
-                       goto out;
-               }
-
-               /*
-                * process the rest of the function using the mirror_num
-                * of the source drive. Therefore look it up first.
-                * At the end, patch the device pointer to the one of the
-                * target drive.
-                */
-               for (i = 0; i < tmp_num_stripes; i++) {
-                       if (tmp_bbio->stripes[i].dev->devid != srcdev_devid)
-                               continue;
-
-                       /*
-                        * In case of DUP, in order to keep it simple, only add
-                        * the mirror with the lowest physical address
-                        */
-                       if (found &&
-                           physical_of_found <= tmp_bbio->stripes[i].physical)
-                               continue;
-
-                       index_srcdev = i;
-                       found = 1;
-                       physical_of_found = tmp_bbio->stripes[i].physical;
-               }
-
-               btrfs_put_bbio(tmp_bbio);
-
-               if (!found) {
-                       WARN_ON(1);
-                       ret = -EIO;
+           !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
+               ret = get_extra_mirror_from_replace(fs_info, logical, *length,
+                                                   dev_replace->srcdev->devid,
+                                                   &mirror_num,
+                                           &physical_to_patch_in_first_stripe);
+               if (ret)
                        goto out;
-               }
-
-               mirror_num = index_srcdev + 1;
-               patch_the_first_stripe_for_dev_replace = 1;
-               physical_to_patch_in_first_stripe = physical_of_found;
+               else
+                       patch_the_first_stripe_for_dev_replace = 1;
        } else if (mirror_num > map->num_stripes) {
                mirror_num = 0;
        }
 
        num_stripes = 1;
        stripe_index = 0;
-       stripe_nr_orig = stripe_nr;
-       stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
-       stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
-       stripe_end_offset = stripe_nr_end * map->stripe_len -
-                           (offset + *length);
-
        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-               if (op == BTRFS_MAP_DISCARD)
-                       num_stripes = min_t(u64, map->num_stripes,
-                                           stripe_nr_end - stripe_nr_orig);
                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
                                &stripe_index);
-               if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
-                   op != BTRFS_MAP_GET_READ_MIRRORS)
+               if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS)
                        mirror_num = 1;
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-               if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
-                   op == BTRFS_MAP_GET_READ_MIRRORS)
+               if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
                        num_stripes = map->num_stripes;
                else if (mirror_num)
                        stripe_index = mirror_num - 1;
@@ -5549,8 +5767,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                }
 
        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-               if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
-                   op == BTRFS_MAP_GET_READ_MIRRORS) {
+               if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) {
                        num_stripes = map->num_stripes;
                } else if (mirror_num) {
                        stripe_index = mirror_num - 1;
@@ -5566,10 +5783,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 
                if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
                        num_stripes = map->sub_stripes;
-               else if (op == BTRFS_MAP_DISCARD)
-                       num_stripes = min_t(u64, map->sub_stripes *
-                                           (stripe_nr_end - stripe_nr_orig),
-                                           map->num_stripes);
                else if (mirror_num)
                        stripe_index += mirror_num - 1;
                else {
@@ -5587,7 +5800,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                    (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
                     mirror_num > 1)) {
                        /* push stripe_nr back to the start of the full stripe */
-                       stripe_nr = div_u64(raid56_full_stripe_start,
+                       stripe_nr = div64_u64(raid56_full_stripe_start,
                                        stripe_len * nr_data_stripes(map));
 
                        /* RAID[56] write or recovery. Return all stripes */
@@ -5612,8 +5825,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                        /* We distribute the parity blocks across stripes */
                        div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
                                        &stripe_index);
-                       if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
-                           op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1)
+                       if ((op != BTRFS_MAP_WRITE &&
+                            op != BTRFS_MAP_GET_READ_MIRRORS) &&
+                           mirror_num <= 1)
                                mirror_num = 1;
                }
        } else {
@@ -5635,8 +5849,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
        }
 
        num_alloc_stripes = num_stripes;
-       if (dev_replace_is_ongoing) {
-               if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD)
+       if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
+               if (op == BTRFS_MAP_WRITE)
                        num_alloc_stripes <<= 1;
                if (op == BTRFS_MAP_GET_READ_MIRRORS)
                        num_alloc_stripes++;
@@ -5648,14 +5862,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                ret = -ENOMEM;
                goto out;
        }
-       if (dev_replace_is_ongoing)
+       if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
                bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
 
        /* build raid_map */
-       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
-           need_raid_map &&
-           ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) ||
-           mirror_num > 1)) {
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
+           (need_full_stripe(op) || mirror_num > 1)) {
                u64 tmp;
                unsigned rot;
 
@@ -5679,173 +5891,27 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                                RAID6_Q_STRIPE;
        }
 
-       if (op == BTRFS_MAP_DISCARD) {
-               u32 factor = 0;
-               u32 sub_stripes = 0;
-               u64 stripes_per_dev = 0;
-               u32 remaining_stripes = 0;
-               u32 last_stripe = 0;
 
-               if (map->type &
-                   (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
-                       if (map->type & BTRFS_BLOCK_GROUP_RAID0)
-                               sub_stripes = 1;
-                       else
-                               sub_stripes = map->sub_stripes;
-
-                       factor = map->num_stripes / sub_stripes;
-                       stripes_per_dev = div_u64_rem(stripe_nr_end -
-                                                     stripe_nr_orig,
-                                                     factor,
-                                                     &remaining_stripes);
-                       div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
-                       last_stripe *= sub_stripes;
-               }
-
-               for (i = 0; i < num_stripes; i++) {
-                       bbio->stripes[i].physical =
-                               map->stripes[stripe_index].physical +
-                               stripe_offset + stripe_nr * map->stripe_len;
-                       bbio->stripes[i].dev = map->stripes[stripe_index].dev;
-
-                       if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-                                        BTRFS_BLOCK_GROUP_RAID10)) {
-                               bbio->stripes[i].length = stripes_per_dev *
-                                                         map->stripe_len;
-
-                               if (i / sub_stripes < remaining_stripes)
-                                       bbio->stripes[i].length +=
-                                               map->stripe_len;
-
-                               /*
-                                * Special for the first stripe and
-                                * the last stripe:
-                                *
-                                * |-------|...|-------|
-                                *     |----------|
-                                *    off     end_off
-                                */
-                               if (i < sub_stripes)
-                                       bbio->stripes[i].length -=
-                                               stripe_offset;
-
-                               if (stripe_index >= last_stripe &&
-                                   stripe_index <= (last_stripe +
-                                                    sub_stripes - 1))
-                                       bbio->stripes[i].length -=
-                                               stripe_end_offset;
-
-                               if (i == sub_stripes - 1)
-                                       stripe_offset = 0;
-                       } else
-                               bbio->stripes[i].length = *length;
-
-                       stripe_index++;
-                       if (stripe_index == map->num_stripes) {
-                               /* This could only happen for RAID0/10 */
-                               stripe_index = 0;
-                               stripe_nr++;
-                       }
-               }
-       } else {
-               for (i = 0; i < num_stripes; i++) {
-                       bbio->stripes[i].physical =
-                               map->stripes[stripe_index].physical +
-                               stripe_offset +
-                               stripe_nr * map->stripe_len;
-                       bbio->stripes[i].dev =
-                               map->stripes[stripe_index].dev;
-                       stripe_index++;
-               }
+       for (i = 0; i < num_stripes; i++) {
+               bbio->stripes[i].physical =
+                       map->stripes[stripe_index].physical +
+                       stripe_offset +
+                       stripe_nr * map->stripe_len;
+               bbio->stripes[i].dev =
+                       map->stripes[stripe_index].dev;
+               stripe_index++;
        }
 
-       if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
+       if (need_full_stripe(op))
                max_errors = btrfs_chunk_max_errors(map);
 
        if (bbio->raid_map)
                sort_parity_stripes(bbio, num_stripes);
 
-       tgtdev_indexes = 0;
-       if (dev_replace_is_ongoing &&
-          (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) &&
-           dev_replace->tgtdev != NULL) {
-               int index_where_to_add;
-               u64 srcdev_devid = dev_replace->srcdev->devid;
-
-               /*
-                * duplicate the write operations while the dev replace
-                * procedure is running. Since the copying of the old disk
-                * to the new disk takes place at run time while the
-                * filesystem is mounted writable, the regular write
-                * operations to the old disk have to be duplicated to go
-                * to the new disk as well.
-                * Note that device->missing is handled by the caller, and
-                * that the write to the old disk is already set up in the
-                * stripes array.
-                */
-               index_where_to_add = num_stripes;
-               for (i = 0; i < num_stripes; i++) {
-                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
-                               /* write to new disk, too */
-                               struct btrfs_bio_stripe *new =
-                                       bbio->stripes + index_where_to_add;
-                               struct btrfs_bio_stripe *old =
-                                       bbio->stripes + i;
-
-                               new->physical = old->physical;
-                               new->length = old->length;
-                               new->dev = dev_replace->tgtdev;
-                               bbio->tgtdev_map[i] = index_where_to_add;
-                               index_where_to_add++;
-                               max_errors++;
-                               tgtdev_indexes++;
-                       }
-               }
-               num_stripes = index_where_to_add;
-       } else if (dev_replace_is_ongoing &&
-                  op == BTRFS_MAP_GET_READ_MIRRORS &&
-                  dev_replace->tgtdev != NULL) {
-               u64 srcdev_devid = dev_replace->srcdev->devid;
-               int index_srcdev = 0;
-               int found = 0;
-               u64 physical_of_found = 0;
-
-               /*
-                * During the dev-replace procedure, the target drive can
-                * also be used to read data in case it is needed to repair
-                * a corrupt block elsewhere. This is possible if the
-                * requested area is left of the left cursor. In this area,
-                * the target drive is a full copy of the source drive.
-                */
-               for (i = 0; i < num_stripes; i++) {
-                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
-                               /*
-                                * In case of DUP, in order to keep it
-                                * simple, only add the mirror with the
-                                * lowest physical address
-                                */
-                               if (found &&
-                                   physical_of_found <=
-                                    bbio->stripes[i].physical)
-                                       continue;
-                               index_srcdev = i;
-                               found = 1;
-                               physical_of_found = bbio->stripes[i].physical;
-                       }
-               }
-               if (found) {
-                       struct btrfs_bio_stripe *tgtdev_stripe =
-                               bbio->stripes + num_stripes;
-
-                       tgtdev_stripe->physical = physical_of_found;
-                       tgtdev_stripe->length =
-                               bbio->stripes[index_srcdev].length;
-                       tgtdev_stripe->dev = dev_replace->tgtdev;
-                       bbio->tgtdev_map[index_srcdev] = num_stripes;
-
-                       tgtdev_indexes++;
-                       num_stripes++;
-               }
+       if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+           need_full_stripe(op)) {
+               handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
+                                         &max_errors);
        }
 
        *bbio_ret = bbio;
@@ -5853,7 +5919,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
        bbio->num_stripes = num_stripes;
        bbio->max_errors = max_errors;
        bbio->mirror_num = mirror_num;
-       bbio->num_tgtdevs = tgtdev_indexes;
 
        /*
         * this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5886,19 +5951,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 /* For Scrub/replace */
 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                     u64 logical, u64 *length,
-                    struct btrfs_bio **bbio_ret, int mirror_num,
-                    int need_raid_map)
+                    struct btrfs_bio **bbio_ret)
 {
-       return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
-                                mirror_num, need_raid_map);
+       return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
 }
 
 int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
                     u64 chunk_start, u64 physical, u64 devid,
                     u64 **logical, int *naddrs, int *stripe_len)
 {
-       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-       struct extent_map_tree *em_tree = &map_tree->map_tree;
        struct extent_map *em;
        struct map_lookup *map;
        u64 *buf;
@@ -5908,24 +5969,11 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
        u64 rmap_len;
        int i, j, nr = 0;
 
-       read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, chunk_start, 1);
-       read_unlock(&em_tree->lock);
-
-       if (!em) {
-               btrfs_err(fs_info, "couldn't find em for chunk %Lu",
-                       chunk_start);
+       em = get_chunk_map(fs_info, chunk_start, 1);
+       if (IS_ERR(em))
                return -EIO;
-       }
 
-       if (em->start != chunk_start) {
-               btrfs_err(fs_info, "bad chunk start, em=%Lu, wanted=%Lu",
-                      em->start, chunk_start);
-               free_extent_map(em);
-               return -EIO;
-       }
        map = em->map_lookup;
-
        length = em->len;
        rmap_len = map->stripe_len;
 
@@ -5949,7 +5997,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
                        continue;
 
                stripe_nr = physical - map->stripes[i].physical;
-               stripe_nr = div_u64(stripe_nr, map->stripe_len);
+               stripe_nr = div64_u64(stripe_nr, map->stripe_len);
 
                if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
                        stripe_nr = stripe_nr * map->num_stripes + i;
index 59be81206dd7b949683ad95c80fa561c022ad812..c7d0fbc915cabdee7cfec437e18f795ace77abbb 100644 (file)
@@ -123,7 +123,6 @@ struct btrfs_device {
        struct list_head resized_list;
 
        /* for sending down flush barriers */
-       int nobarriers;
        struct bio *flush_bio;
        struct completion flush_wait;
 
@@ -298,7 +297,7 @@ struct btrfs_bio;
 typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 
 struct btrfs_bio {
-       atomic_t refs;
+       refcount_t refs;
        atomic_t stripes_pending;
        struct btrfs_fs_info *fs_info;
        u64 map_type; /* get from map_lookup->type */
@@ -400,8 +399,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                    struct btrfs_bio **bbio_ret, int mirror_num);
 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                     u64 logical, u64 *length,
-                    struct btrfs_bio **bbio_ret, int mirror_num,
-                    int need_raid_map);
+                    struct btrfs_bio **bbio_ret);
 int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
                     u64 chunk_start, u64 physical, u64 devid,
                     u64 **logical, int *naddrs, int *stripe_len);
@@ -475,7 +473,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
                                              struct btrfs_device *tgtdev);
 void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
-int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
                           u64 logical, u64 len, int mirror_num);
 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
                                    struct btrfs_mapping_tree *map_tree,
index a3c3cab643a9528dd5f8ad50dfac29b53384f1db..e37973526153a83c2d96169b77e507525c53cc53 100644 (file)
@@ -12,6 +12,7 @@ struct btrfs_root;
 struct btrfs_fs_info;
 struct btrfs_inode;
 struct extent_map;
+struct btrfs_file_extent_item;
 struct btrfs_ordered_extent;
 struct btrfs_delayed_ref_node;
 struct btrfs_delayed_tree_ref;
@@ -24,6 +25,7 @@ struct extent_buffer;
 struct btrfs_work;
 struct __btrfs_workqueue;
 struct btrfs_qgroup_extent_record;
+struct btrfs_qgroup;
 
 #define show_ref_type(type)                                            \
        __print_symbolic(type,                                          \
@@ -54,6 +56,12 @@ struct btrfs_qgroup_extent_record;
              (obj >= BTRFS_ROOT_TREE_OBJECTID &&                       \
               obj <= BTRFS_QUOTA_TREE_OBJECTID)) ? __show_root_type(obj) : "-"
 
+#define show_fi_type(type)                                             \
+       __print_symbolic(type,                                          \
+                { BTRFS_FILE_EXTENT_INLINE,    "INLINE" },             \
+                { BTRFS_FILE_EXTENT_REG,       "REG"    },             \
+                { BTRFS_FILE_EXTENT_PREALLOC,  "PREALLOC"})
+
 #define BTRFS_GROUP_FLAGS      \
        { BTRFS_BLOCK_GROUP_DATA,       "DATA"},        \
        { BTRFS_BLOCK_GROUP_SYSTEM,     "SYSTEM"},      \
@@ -213,7 +221,7 @@ TRACE_EVENT_CONDITION(btrfs_get_extent,
                __entry->block_start    = map->block_start;
                __entry->block_len      = map->block_len;
                __entry->flags          = map->flags;
-               __entry->refs           = atomic_read(&map->refs);
+               __entry->refs           = refcount_read(&map->refs);
                __entry->compress_type  = map->compress_type;
        ),
 
@@ -232,6 +240,138 @@ TRACE_EVENT_CONDITION(btrfs_get_extent,
                  __entry->refs, __entry->compress_type)
 );
 
+/* file extent item */
+DECLARE_EVENT_CLASS(btrfs__file_extent_item_regular,
+
+       TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l,
+                struct btrfs_file_extent_item *fi, u64 start),
+
+       TP_ARGS(bi, l, fi, start),
+
+       TP_STRUCT__entry_btrfs(
+               __field(        u64,    root_obj        )
+               __field(        u64,    ino             )
+               __field(        loff_t, isize           )
+               __field(        u64,    disk_isize      )
+               __field(        u64,    num_bytes       )
+               __field(        u64,    ram_bytes       )
+               __field(        u64,    disk_bytenr     )
+               __field(        u64,    disk_num_bytes  )
+               __field(        u64,    extent_offset   )
+               __field(        u8,     extent_type     )
+               __field(        u8,     compression     )
+               __field(        u64,    extent_start    )
+               __field(        u64,    extent_end      )
+       ),
+
+       TP_fast_assign_btrfs(bi->root->fs_info,
+               __entry->root_obj       = bi->root->objectid;
+               __entry->ino            = btrfs_ino(bi);
+               __entry->isize          = bi->vfs_inode.i_size;
+               __entry->disk_isize     = bi->disk_i_size;
+               __entry->num_bytes      = btrfs_file_extent_num_bytes(l, fi);
+               __entry->ram_bytes      = btrfs_file_extent_ram_bytes(l, fi);
+               __entry->disk_bytenr    = btrfs_file_extent_disk_bytenr(l, fi);
+               __entry->disk_num_bytes = btrfs_file_extent_disk_num_bytes(l, fi);
+               __entry->extent_offset  = btrfs_file_extent_offset(l, fi);
+               __entry->extent_type    = btrfs_file_extent_type(l, fi);
+               __entry->compression    = btrfs_file_extent_compression(l, fi);
+               __entry->extent_start   = start;
+               __entry->extent_end     = (start + __entry->num_bytes);
+       ),
+
+       TP_printk_btrfs(
+               "root=%llu(%s) inode=%llu size=%llu disk_isize=%llu "
+               "file extent range=[%llu %llu] "
+               "(num_bytes=%llu ram_bytes=%llu disk_bytenr=%llu "
+               "disk_num_bytes=%llu extent_offset=%llu type=%s "
+               "compression=%u",
+               show_root_type(__entry->root_obj), __entry->ino,
+               __entry->isize,
+               __entry->disk_isize, __entry->extent_start,
+               __entry->extent_end, __entry->num_bytes, __entry->ram_bytes,
+               __entry->disk_bytenr, __entry->disk_num_bytes,
+               __entry->extent_offset, show_fi_type(__entry->extent_type),
+               __entry->compression)
+);
+
+DECLARE_EVENT_CLASS(
+       btrfs__file_extent_item_inline,
+
+       TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l,
+                struct btrfs_file_extent_item *fi, int slot, u64 start),
+
+       TP_ARGS(bi, l, fi, slot,  start),
+
+       TP_STRUCT__entry_btrfs(
+               __field(        u64,    root_obj        )
+               __field(        u64,    ino             )
+               __field(        loff_t, isize           )
+               __field(        u64,    disk_isize      )
+               __field(        u8,     extent_type     )
+               __field(        u8,     compression     )
+               __field(        u64,    extent_start    )
+               __field(        u64,    extent_end      )
+       ),
+
+       TP_fast_assign_btrfs(
+               bi->root->fs_info,
+               __entry->root_obj       = bi->root->objectid;
+               __entry->ino            = btrfs_ino(bi);
+               __entry->isize          = bi->vfs_inode.i_size;
+               __entry->disk_isize     = bi->disk_i_size;
+               __entry->extent_type    = btrfs_file_extent_type(l, fi);
+               __entry->compression    = btrfs_file_extent_compression(l, fi);
+               __entry->extent_start   = start;
+               __entry->extent_end     = (start + btrfs_file_extent_inline_len(l, slot, fi));
+       ),
+
+       TP_printk_btrfs(
+               "root=%llu(%s) inode=%llu size=%llu disk_isize=%llu "
+               "file extent range=[%llu %llu] "
+               "extent_type=%s compression=%u",
+               show_root_type(__entry->root_obj), __entry->ino, __entry->isize,
+               __entry->disk_isize, __entry->extent_start,
+               __entry->extent_end, show_fi_type(__entry->extent_type),
+               __entry->compression)
+);
+
+DEFINE_EVENT(
+       btrfs__file_extent_item_regular, btrfs_get_extent_show_fi_regular,
+
+       TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l,
+                struct btrfs_file_extent_item *fi, u64 start),
+
+       TP_ARGS(bi, l, fi, start)
+);
+
+DEFINE_EVENT(
+       btrfs__file_extent_item_regular, btrfs_truncate_show_fi_regular,
+
+       TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l,
+                struct btrfs_file_extent_item *fi, u64 start),
+
+       TP_ARGS(bi, l, fi, start)
+);
+
+DEFINE_EVENT(
+       btrfs__file_extent_item_inline, btrfs_get_extent_show_fi_inline,
+
+       TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l,
+                struct btrfs_file_extent_item *fi, int slot, u64 start),
+
+       TP_ARGS(bi, l, fi, slot, start)
+);
+
+DEFINE_EVENT(
+       btrfs__file_extent_item_inline, btrfs_truncate_show_fi_inline,
+
+       TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l,
+                struct btrfs_file_extent_item *fi, int slot, u64 start),
+
+       TP_ARGS(bi, l, fi, slot, start)
+);
+
 #define show_ordered_flags(flags)                                         \
        __print_flags(flags, "|",                                          \
                { (1 << BTRFS_ORDERED_IO_DONE),         "IO_DONE"       }, \
@@ -275,7 +415,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
                __entry->bytes_left     = ordered->bytes_left;
                __entry->flags          = ordered->flags;
                __entry->compress_type  = ordered->compress_type;
-               __entry->refs           = atomic_read(&ordered->refs);
+               __entry->refs           = refcount_read(&ordered->refs);
                __entry->root_objectid  =
                                BTRFS_I(inode)->root->root_key.objectid;
                __entry->truncated_len  = ordered->truncated_len;
@@ -1475,6 +1615,49 @@ TRACE_EVENT(qgroup_update_counters,
                  __entry->cur_new_count)
 );
 
+TRACE_EVENT(qgroup_update_reserve,
+
+       TP_PROTO(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup,
+                s64 diff),
+
+       TP_ARGS(fs_info, qgroup, diff),
+
+       TP_STRUCT__entry_btrfs(
+               __field(        u64,    qgid                    )
+               __field(        u64,    cur_reserved            )
+               __field(        s64,    diff                    )
+       ),
+
+       TP_fast_assign_btrfs(fs_info,
+               __entry->qgid           = qgroup->qgroupid;
+               __entry->cur_reserved   = qgroup->reserved;
+               __entry->diff           = diff;
+       ),
+
+       TP_printk_btrfs("qgid=%llu cur_reserved=%llu diff=%lld",
+               __entry->qgid, __entry->cur_reserved, __entry->diff)
+);
+
+TRACE_EVENT(qgroup_meta_reserve,
+
+       TP_PROTO(struct btrfs_root *root, s64 diff),
+
+       TP_ARGS(root, diff),
+
+       TP_STRUCT__entry_btrfs(
+               __field(        u64,    refroot                 )
+               __field(        s64,    diff                    )
+       ),
+
+       TP_fast_assign_btrfs(root->fs_info,
+               __entry->refroot        = root->objectid;
+               __entry->diff           = diff;
+       ),
+
+       TP_printk_btrfs("refroot=%llu(%s) diff=%lld",
+               show_root_type(__entry->refroot), __entry->diff)
+);
+
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
index dcfc3a5a9cb1d20f29bbac00c6ef315006e9d208..a456e5309238bbd78466abee16a0da6ab0248e50 100644 (file)
@@ -291,10 +291,10 @@ struct btrfs_ioctl_feature_flags {
 struct btrfs_balance_args {
        __u64 profiles;
        union {
-               __le64 usage;
+               __u64 usage;
                struct {
-                       __le32 usage_min;
-                       __le32 usage_max;
+                       __u32 usage_min;
+                       __u32 usage_max;
                };
        };
        __u64 devid;
@@ -324,8 +324,8 @@ struct btrfs_balance_args {
         * Process chunks that cross stripes_min..stripes_max devices,
         * BTRFS_BALANCE_ARGS_STRIPES_RANGE
         */
-       __le32 stripes_min;
-       __le32 stripes_max;
+       __u32 stripes_min;
+       __u32 stripes_max;
 
        __u64 unused[6];
 } __attribute__ ((__packed__));