btrfs: do reverse path readahead in btrfs_shrink_device
[sfrench/cifs-2.6.git] / fs/btrfs/volumes.c
index 292266f6ab9c9d8dfa18422998ec1884ae57369d..78f6cd1de33aca95a555ca19961f3da39fb6f3d7 100644
@@ -40,6 +40,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
+               .raid_name      = "raid10",
+               .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
+               .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
@@ -49,6 +52,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
+               .raid_name      = "raid1",
+               .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
+               .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
@@ -58,6 +64,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
+               .raid_name      = "dup",
+               .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
+               .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
@@ -67,6 +76,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
+               .raid_name      = "raid0",
+               .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
+               .mindev_error   = 0,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
@@ -76,6 +88,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
+               .raid_name      = "single",
+               .bg_flag        = 0,
+               .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
@@ -85,6 +100,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 2,
+               .raid_name      = "raid5",
+               .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
+               .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
@@ -94,33 +112,19 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 3,
+               .raid_name      = "raid6",
+               .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
+               .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
        },
 };
 
-const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
-       [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
-       [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
-       [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
-       [BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
-       [BTRFS_RAID_SINGLE] = 0,
-       [BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
-       [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
-};
+const char *get_raid_name(enum btrfs_raid_types type)
+{
+       if (type >= BTRFS_NR_RAID_TYPES)
+               return NULL;
 
-/*
- * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
- * condition is not met. Zero means there's no corresponding
- * BTRFS_ERROR_DEV_*_NOT_MET value.
- */
-const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
-       [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
-       [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
-       [BTRFS_RAID_DUP]    = 0,
-       [BTRFS_RAID_RAID0]  = 0,
-       [BTRFS_RAID_SINGLE] = 0,
-       [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
-       [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
-};
+       return btrfs_raid_array[type].raid_name;
+}
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info);
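
The per-profile lookup tables that used to sit next to btrfs_raid_array are now plain members of the btrfs_raid_attr entries (raid_name, bg_flag, mindev_error), so callers index a single table. As a rough illustration of the lookup pattern this enables, here is a minimal sketch; the helper name is hypothetical and the field list is assumed from the initializers above, mirroring how btrfs_check_raid_min_devices() consumes the table later in this diff.

/*
 * Illustration only, not part of this patch: walk btrfs_raid_array and use
 * the consolidated bg_flag/devs_min/mindev_error members instead of the
 * removed side tables.
 */
static int check_min_devices_sketch(u64 profile_flags, u64 num_devices)
{
        int i;

        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
                if (!(profile_flags & btrfs_raid_array[i].bg_flag))
                        continue;
                if (num_devices < btrfs_raid_array[i].devs_min &&
                    btrfs_raid_array[i].mindev_error)
                        return btrfs_raid_array[i].mindev_error;
        }

        return 0;
}

get_raid_name() above follows the same pattern, bounds-checking the enum before indexing the table.
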
@@ -167,12 +171,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
  * may be used to exclude some operations from running concurrently without any
  * modifications to the list (see write_all_supers)
  *
- * volume_mutex
- * ------------
- * coarse lock owned by a mounted filesystem; used to exclude some operations
- * that cannot run in parallel and affect the higher-level properties of the
- * filesystem like: device add/deleting/resize/replace, or balance
- *
  * balance_mutex
  * -------------
  * protects balance structures (status, state) and context accessed from
@@ -197,6 +195,41 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
  *     device_list_mutex
  *       chunk_mutex
  *     balance_mutex
+ *
+ *
+ * Exclusive operations, BTRFS_FS_EXCL_OP
+ * ======================================
+ *
+ * Maintains the exclusivity of the following operations that apply to the
+ * whole filesystem and cannot run in parallel.
+ *
+ * - Balance (*)
+ * - Device add
+ * - Device remove
+ * - Device replace (*)
+ * - Resize
+ *
+ * The device operations (as above) can be in one of the following states:
+ *
+ * - Running state
+ * - Paused state
+ * - Completed state
+ *
+ * Only device operations marked with (*) can go into the Paused state for the
+ * following reasons:
+ *
+ * - ioctl (only Balance can be Paused through ioctl)
+ * - filesystem remounted as read-only
+ * - filesystem unmounted and mounted as read-only
+ * - system power-cycle and filesystem mounted as read-only
+ * - filesystem or device errors leading to forced read-only
+ *
+ * The BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
+ * While an operation is in the Paused state, BTRFS_FS_EXCL_OP remains set.
+ * A device operation in the Paused or Running state can be canceled or resumed
+ * either by ioctl (Balance only) or when remounted as read-write.
+ * The BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled
+ * or completed.
  */
 
 DEFINE_MUTEX(uuid_mutex);
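
To make the BTRFS_FS_EXCL_OP description above concrete, here is a minimal sketch of the claim/release pattern, assuming only the test_and_set_bit()/clear_bit() calls on BTRFS_FS_EXCL_OP that appear later in this diff; the helper name, the -EBUSY return, and the operation body are placeholders, not part of this patch.

/*
 * Sketch only: claim whole-filesystem exclusivity, do the work, release it.
 */
static int run_exclusive_op_sketch(struct btrfs_fs_info *fs_info)
{
        int ret;

        /* Claim exclusivity, or back off if another operation owns it. */
        if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
                return -EBUSY;

        ret = 0;        /* ... device add/remove/resize/balance work ... */

        /*
         * Release exclusivity on completion or cancel. A paused balance or
         * replace keeps the bit set instead, as described above.
         */
        clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
        return ret;
}
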
@@ -227,14 +260,14 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->resized_devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
-       INIT_LIST_HEAD(&fs_devs->list);
+       INIT_LIST_HEAD(&fs_devs->fs_list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 
        return fs_devs;
 }
 
-static void free_device(struct btrfs_device *device)
+void btrfs_free_device(struct btrfs_device *device)
 {
        rcu_string_free(device->name);
        bio_put(device->flush_bio);
@@ -249,7 +282,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
-               free_device(device);
+               btrfs_free_device(device);
        }
        kfree(fs_devices);
 }
@@ -273,8 +306,8 @@ void __exit btrfs_cleanup_fs_uuids(void)
 
        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
-                                       struct btrfs_fs_devices, list);
-               list_del(&fs_devices->list);
+                                       struct btrfs_fs_devices, fs_list);
+               list_del(&fs_devices->fs_list);
                free_fs_devices(fs_devices);
        }
 }
@@ -282,7 +315,7 @@ void __exit btrfs_cleanup_fs_uuids(void)
 /*
  * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
  * Returned struct is not linked onto any lists and must be destroyed using
- * free_device.
+ * btrfs_free_device.
  */
 static struct btrfs_device *__alloc_device(void)
 {
@@ -327,10 +360,9 @@ static struct btrfs_device *__alloc_device(void)
 static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
                u64 devid, const u8 *uuid)
 {
-       struct list_head *head = &fs_devices->devices;
        struct btrfs_device *dev;
 
-       list_for_each_entry(dev, head, dev_list) {
+       list_for_each_entry(dev, &fs_devices->devices, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
@@ -343,7 +375,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 {
        struct btrfs_fs_devices *fs_devices;
 
-       list_for_each_entry(fs_devices, &fs_uuids, list) {
+       list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
@@ -607,7 +639,7 @@ static void btrfs_free_stale_devices(const char *path,
        struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
        struct btrfs_device *dev, *tmp_dev;
 
-       list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {
+       list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {
 
                if (fs_devs->opened)
                        continue;
@@ -632,13 +664,13 @@ static void btrfs_free_stale_devices(const char *path,
                        /* delete the stale device */
                        if (fs_devs->num_devices == 1) {
                                btrfs_sysfs_remove_fsid(fs_devs);
-                               list_del(&fs_devs->list);
+                               list_del(&fs_devs->fs_list);
                                free_fs_devices(fs_devs);
                                break;
                        } else {
                                fs_devs->num_devices--;
                                list_del(&dev->dev_list);
-                               free_device(dev);
+                               btrfs_free_device(dev);
                        }
                }
        }
@@ -732,7 +764,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);
 
-               list_add(&fs_devices->list, &fs_uuids);
+               list_add(&fs_devices->fs_list, &fs_uuids);
 
                device = NULL;
        } else {
@@ -753,7 +785,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 
                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
-                       free_device(device);
+                       btrfs_free_device(device);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);
@@ -866,7 +898,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
-                               free_device(device);
+                               btrfs_free_device(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
@@ -938,7 +970,7 @@ again:
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
-               free_device(device);
+               btrfs_free_device(device);
        }
 
        if (fs_devices->seed) {
@@ -956,7 +988,7 @@ static void free_device_rcu(struct rcu_head *head)
        struct btrfs_device *device;
 
        device = container_of(head, struct btrfs_device, rcu);
-       free_device(device);
+       btrfs_free_device(device);
 }
 
 static void btrfs_close_bdev(struct btrfs_device *device)
@@ -1005,7 +1037,7 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device)
        new_device->fs_devices = device->fs_devices;
 }
 
-static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
 {
        struct btrfs_device *device, *tmp;
        struct list_head pending_put;
@@ -1050,7 +1082,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
        int ret;
 
        mutex_lock(&uuid_mutex);
-       ret = __btrfs_close_devices(fs_devices);
+       ret = close_fs_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
@@ -1060,23 +1092,22 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
-               __btrfs_close_devices(fs_devices);
+               close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
 }
 
-static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
 {
-       struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        int ret = 0;
 
        flags |= FMODE_EXCL;
 
-       list_for_each_entry(device, head, dev_list) {
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* Just open everything we can; ignore failures here */
                if (btrfs_open_one_device(fs_devices, device, flags, holder))
                        continue;
@@ -1121,7 +1152,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                ret = 0;
        } else {
                list_sort(NULL, &fs_devices->devices, devid_cmp);
-               ret = __btrfs_open_devices(fs_devices, flags, holder);
+               ret = open_fs_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
@@ -1857,11 +1888,11 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
        } while (read_seqretry(&fs_info->profiles_lock, seq));
 
        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
-               if (!(all_avail & btrfs_raid_group[i]))
+               if (!(all_avail & btrfs_raid_array[i].bg_flag))
                        continue;
 
                if (num_devices < btrfs_raid_array[i].devs_min) {
-                       int ret = btrfs_raid_mindev_error[i];
+                       int ret = btrfs_raid_array[i].mindev_error;
 
                        if (ret)
                                return ret;
@@ -1917,13 +1948,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 {
        struct btrfs_device *device;
        struct btrfs_fs_devices *cur_devices;
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        u64 num_devices;
        int ret = 0;
 
-       mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&uuid_mutex);
 
-       num_devices = fs_info->fs_devices->num_devices;
+       num_devices = fs_devices->num_devices;
        btrfs_dev_replace_read_lock(&fs_info->dev_replace);
        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
                WARN_ON(num_devices < 1);
@@ -1987,7 +2018,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
         */
 
        cur_devices = device->fs_devices;
-       mutex_lock(&fs_info->fs_devices->device_list_mutex);
+       mutex_lock(&fs_devices->device_list_mutex);
        list_del_rcu(&device->dev_list);
 
        device->fs_devices->num_devices--;
@@ -2001,12 +2032,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
        if (device->bdev) {
                device->fs_devices->open_devices--;
                /* remove sysfs entry */
-               btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
+               btrfs_sysfs_rm_device_link(fs_devices, device);
        }
 
        num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
        btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
-       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+       mutex_unlock(&fs_devices->device_list_mutex);
 
        /*
         * at this point, the device is zero sized and detached from
@@ -2020,8 +2051,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
        call_rcu(&device->rcu, free_device_rcu);
 
        if (cur_devices->open_devices == 0) {
-               struct btrfs_fs_devices *fs_devices;
-               fs_devices = fs_info->fs_devices;
                while (fs_devices) {
                        if (fs_devices->seed == cur_devices) {
                                fs_devices->seed = cur_devices->seed;
@@ -2030,20 +2059,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
                        fs_devices = fs_devices->seed;
                }
                cur_devices->seed = NULL;
-               __btrfs_close_devices(cur_devices);
+               close_fs_devices(cur_devices);
                free_fs_devices(cur_devices);
        }
 
 out:
        mutex_unlock(&uuid_mutex);
-       mutex_unlock(&fs_info->volume_mutex);
        return ret;
 
 error_undo:
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                mutex_lock(&fs_info->chunk_mutex);
                list_add(&device->dev_alloc_list,
-                        &fs_info->fs_devices->alloc_list);
+                        &fs_devices->alloc_list);
                device->fs_devices->rw_devices++;
                mutex_unlock(&fs_info->chunk_mutex);
        }
@@ -2112,7 +2140,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
                        tmp_fs_devices = tmp_fs_devices->seed;
                }
                fs_devices->seed = NULL;
-               __btrfs_close_devices(fs_devices);
+               close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
 }
@@ -2188,10 +2216,6 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
                struct btrfs_device *tmp;
 
                devices = &fs_info->fs_devices->devices;
-               /*
-                * It is safe to read the devices since the volume_mutex
-                * is held by the caller.
-                */
                list_for_each_entry(tmp, devices, dev_list) {
                        if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                        &tmp->dev_state) && !tmp->bdev) {
@@ -2259,7 +2283,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
                return PTR_ERR(old_devices);
        }
 
-       list_add(&old_devices->list, &fs_uuids);
+       list_add(&old_devices->fs_list, &fs_uuids);
 
        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
        seed_devices->opened = 1;
@@ -2570,7 +2594,7 @@ error_trans:
        if (trans)
                btrfs_end_transaction(trans);
 error_free_device:
-       free_device(device);
+       btrfs_free_device(device);
 error:
        blkdev_put(bdev, FMODE_EXCL);
        if (seeding_dev && !unlocked) {
@@ -2580,99 +2604,6 @@ error:
        return ret;
 }
 
-int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
-                                 const char *device_path,
-                                 struct btrfs_device *srcdev,
-                                 struct btrfs_device **device_out)
-{
-       struct btrfs_device *device;
-       struct block_device *bdev;
-       struct list_head *devices;
-       struct rcu_string *name;
-       u64 devid = BTRFS_DEV_REPLACE_DEVID;
-       int ret = 0;
-
-       *device_out = NULL;
-       if (fs_info->fs_devices->seeding) {
-               btrfs_err(fs_info, "the filesystem is a seed filesystem!");
-               return -EINVAL;
-       }
-
-       bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
-                                 fs_info->bdev_holder);
-       if (IS_ERR(bdev)) {
-               btrfs_err(fs_info, "target device %s is invalid!", device_path);
-               return PTR_ERR(bdev);
-       }
-
-       filemap_write_and_wait(bdev->bd_inode->i_mapping);
-
-       devices = &fs_info->fs_devices->devices;
-       list_for_each_entry(device, devices, dev_list) {
-               if (device->bdev == bdev) {
-                       btrfs_err(fs_info,
-                                 "target device is in the filesystem!");
-                       ret = -EEXIST;
-                       goto error;
-               }
-       }
-
-
-       if (i_size_read(bdev->bd_inode) <
-           btrfs_device_get_total_bytes(srcdev)) {
-               btrfs_err(fs_info,
-                         "target device is smaller than source device!");
-               ret = -EINVAL;
-               goto error;
-       }
-
-
-       device = btrfs_alloc_device(NULL, &devid, NULL);
-       if (IS_ERR(device)) {
-               ret = PTR_ERR(device);
-               goto error;
-       }
-
-       name = rcu_string_strdup(device_path, GFP_KERNEL);
-       if (!name) {
-               free_device(device);
-               ret = -ENOMEM;
-               goto error;
-       }
-       rcu_assign_pointer(device->name, name);
-
-       mutex_lock(&fs_info->fs_devices->device_list_mutex);
-       set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
-       device->generation = 0;
-       device->io_width = fs_info->sectorsize;
-       device->io_align = fs_info->sectorsize;
-       device->sector_size = fs_info->sectorsize;
-       device->total_bytes = btrfs_device_get_total_bytes(srcdev);
-       device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
-       device->bytes_used = btrfs_device_get_bytes_used(srcdev);
-       device->commit_total_bytes = srcdev->commit_total_bytes;
-       device->commit_bytes_used = device->bytes_used;
-       device->fs_info = fs_info;
-       device->bdev = bdev;
-       set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
-       set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
-       device->mode = FMODE_EXCL;
-       device->dev_stats_valid = 1;
-       set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
-       device->fs_devices = fs_info->fs_devices;
-       list_add(&device->dev_list, &fs_info->fs_devices->devices);
-       fs_info->fs_devices->num_devices++;
-       fs_info->fs_devices->open_devices++;
-       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-
-       *device_out = device;
-       return ret;
-
-error:
-       blkdev_put(bdev, FMODE_EXCL);
-       return ret;
-}
-
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
                                        struct btrfs_device *device)
 {
@@ -3273,24 +3204,12 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
 }
 
 /*
- * Should be called with both balance and volume mutexes held to
- * serialize other volume operations (add_dev/rm_dev/resize) with
- * restriper.  Same goes for unset_balance_control.
+ * Clear the balance status in fs_info and delete the balance item from disk.
  */
-static void set_balance_control(struct btrfs_balance_control *bctl)
-{
-       struct btrfs_fs_info *fs_info = bctl->fs_info;
-
-       BUG_ON(fs_info->balance_ctl);
-
-       spin_lock(&fs_info->balance_lock);
-       fs_info->balance_ctl = bctl;
-       spin_unlock(&fs_info->balance_lock);
-}
-
-static void unset_balance_control(struct btrfs_fs_info *fs_info)
+static void reset_balance_state(struct btrfs_fs_info *fs_info)
 {
        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+       int ret;
 
        BUG_ON(!fs_info->balance_ctl);
 
@@ -3299,6 +3218,9 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info)
        spin_unlock(&fs_info->balance_lock);
 
        kfree(bctl);
+       ret = del_balance_item(fs_info);
+       if (ret)
+               btrfs_handle_fs_error(fs_info, ret, NULL);
 }
 
 /*
@@ -3835,18 +3757,6 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info)
                 atomic_read(&fs_info->balance_cancel_req) == 0);
 }
 
-static void __cancel_balance(struct btrfs_fs_info *fs_info)
-{
-       int ret;
-
-       unset_balance_control(fs_info);
-       ret = del_balance_item(fs_info);
-       if (ret)
-               btrfs_handle_fs_error(fs_info, ret, NULL);
-
-       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-}
-
 /* Non-zero return value signifies invalidity */
 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
                u64 allowed)
@@ -3857,7 +3767,7 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
 }
 
 /*
- * Should be called with both balance and volume mutexes held
+ * Should be called with the balance mutex held
  */
 int btrfs_balance(struct btrfs_balance_control *bctl,
                  struct btrfs_ioctl_balance_args *bargs)
@@ -3978,7 +3888,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 
        if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
                BUG_ON(ret == -EEXIST);
-               set_balance_control(bctl);
+               BUG_ON(fs_info->balance_ctl);
+               spin_lock(&fs_info->balance_lock);
+               fs_info->balance_ctl = bctl;
+               spin_unlock(&fs_info->balance_lock);
        } else {
                BUG_ON(ret != -EEXIST);
                spin_lock(&fs_info->balance_lock);
@@ -3986,22 +3899,24 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                spin_unlock(&fs_info->balance_lock);
        }
 
-       atomic_inc(&fs_info->balance_running);
+       ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+       set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
        mutex_unlock(&fs_info->balance_mutex);
 
        ret = __btrfs_balance(fs_info);
 
        mutex_lock(&fs_info->balance_mutex);
-       atomic_dec(&fs_info->balance_running);
+       clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
 
        if (bargs) {
                memset(bargs, 0, sizeof(*bargs));
-               update_ioctl_balance_args(fs_info, 0, bargs);
+               btrfs_update_ioctl_balance_args(fs_info, bargs);
        }
 
        if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
            balance_need_close(fs_info)) {
-               __cancel_balance(fs_info);
+               reset_balance_state(fs_info);
+               clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
        }
 
        wake_up(&fs_info->balance_wait_q);
@@ -4009,11 +3924,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        return ret;
 out:
        if (bctl->flags & BTRFS_BALANCE_RESUME)
-               __cancel_balance(fs_info);
-       else {
+               reset_balance_state(fs_info);
+       else
                kfree(bctl);
-               clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-       }
+       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+
        return ret;
 }
 
@@ -4022,16 +3937,12 @@ static int balance_kthread(void *data)
        struct btrfs_fs_info *fs_info = data;
        int ret = 0;
 
-       mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&fs_info->balance_mutex);
-
        if (fs_info->balance_ctl) {
                btrfs_info(fs_info, "continuing balance");
                ret = btrfs_balance(fs_info->balance_ctl, NULL);
        }
-
        mutex_unlock(&fs_info->balance_mutex);
-       mutex_unlock(&fs_info->volume_mutex);
 
        return ret;
 }
@@ -4040,18 +3951,27 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
 {
        struct task_struct *tsk;
 
-       spin_lock(&fs_info->balance_lock);
+       mutex_lock(&fs_info->balance_mutex);
        if (!fs_info->balance_ctl) {
-               spin_unlock(&fs_info->balance_lock);
+               mutex_unlock(&fs_info->balance_mutex);
                return 0;
        }
-       spin_unlock(&fs_info->balance_lock);
+       mutex_unlock(&fs_info->balance_mutex);
 
        if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
                btrfs_info(fs_info, "force skipping balance");
                return 0;
        }
 
+       /*
+        * A ro->rw remount sequence should continue with the paused balance
+        * regardless of who paused it (the system or the user), so set
+        * the resume flag.
+        */
+       spin_lock(&fs_info->balance_lock);
+       fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
+       spin_unlock(&fs_info->balance_lock);
+
        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
        return PTR_ERR_OR_ZERO(tsk);
 }
@@ -4102,15 +4022,26 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
        btrfs_balance_sys(leaf, item, &disk_bargs);
        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
 
-       WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
+       /*
+        * This should never happen, as the paused balance state is recovered
+        * during mount without any chance of another exclusive op colliding.
+        *
+        * This gives the exclusive op status to balance and keeps it in the
+        * paused state until user intervention (cancel or umount). If the
+        * ownership cannot be assigned, show a message but do not fail. The
+        * balance is in a paused state and must have fs_info::balance_ctl
+        * properly set up.
+        */
+       if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+               btrfs_warn(fs_info,
+       "cannot set exclusive op status to balance, resume manually");
 
-       mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&fs_info->balance_mutex);
-
-       set_balance_control(bctl);
-
+       BUG_ON(fs_info->balance_ctl);
+       spin_lock(&fs_info->balance_lock);
+       fs_info->balance_ctl = bctl;
+       spin_unlock(&fs_info->balance_lock);
        mutex_unlock(&fs_info->balance_mutex);
-       mutex_unlock(&fs_info->volume_mutex);
 out:
        btrfs_free_path(path);
        return ret;
@@ -4126,16 +4057,16 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
                return -ENOTCONN;
        }
 
-       if (atomic_read(&fs_info->balance_running)) {
+       if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
                atomic_inc(&fs_info->balance_pause_req);
                mutex_unlock(&fs_info->balance_mutex);
 
                wait_event(fs_info->balance_wait_q,
-                          atomic_read(&fs_info->balance_running) == 0);
+                          !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
 
                mutex_lock(&fs_info->balance_mutex);
                /* we are good with balance_ctl ripped off from under us */
-               BUG_ON(atomic_read(&fs_info->balance_running));
+               BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
                atomic_dec(&fs_info->balance_pause_req);
        } else {
                ret = -ENOTCONN;
@@ -4147,38 +4078,48 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
 
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
 {
-       if (sb_rdonly(fs_info->sb))
-               return -EROFS;
-
        mutex_lock(&fs_info->balance_mutex);
        if (!fs_info->balance_ctl) {
                mutex_unlock(&fs_info->balance_mutex);
                return -ENOTCONN;
        }
 
+       /*
+        * A paused balance with the item stored on disk can be resumed at
+        * mount time if the mount is read-write. Otherwise it's still paused
+        * and we must not allow cancelling as it deletes the item.
+        */
+       if (sb_rdonly(fs_info->sb)) {
+               mutex_unlock(&fs_info->balance_mutex);
+               return -EROFS;
+       }
+
        atomic_inc(&fs_info->balance_cancel_req);
        /*
         * if we are running just wait and return, balance item is
         * deleted in btrfs_balance in this case
         */
-       if (atomic_read(&fs_info->balance_running)) {
+       if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
                mutex_unlock(&fs_info->balance_mutex);
                wait_event(fs_info->balance_wait_q,
-                          atomic_read(&fs_info->balance_running) == 0);
+                          !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
                mutex_lock(&fs_info->balance_mutex);
        } else {
-               /* __cancel_balance needs volume_mutex */
                mutex_unlock(&fs_info->balance_mutex);
-               mutex_lock(&fs_info->volume_mutex);
+               /*
+                * Lock released to allow other waiters to continue; we'll
+                * reexamine the status once we retake the lock.
+                */
                mutex_lock(&fs_info->balance_mutex);
 
-               if (fs_info->balance_ctl)
-                       __cancel_balance(fs_info);
-
-               mutex_unlock(&fs_info->volume_mutex);
+               if (fs_info->balance_ctl) {
+                       reset_balance_state(fs_info);
+                       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+               }
        }
 
-       BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+       BUG_ON(fs_info->balance_ctl ||
+               test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
        atomic_dec(&fs_info->balance_cancel_req);
        mutex_unlock(&fs_info->balance_mutex);
        return 0;
@@ -4473,7 +4414,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
        if (!path)
                return -ENOMEM;
 
-       path->reada = READA_FORWARD;
+       path->reada = READA_BACK;
 
        mutex_lock(&fs_info->chunk_mutex);
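
The READA_FORWARD to READA_BACK switch above matches how btrfs_shrink_device() walks the device's dev extents: it positions the path past the last extent and then steps toward lower offsets, so readahead in the reverse direction follows the actual access pattern. A rough sketch of that backward walk is below; the helper name and loop structure are illustrative only, and the per-extent size checks and chunk relocation done by the real function are elided.

/*
 * Sketch only, not part of this patch: backward iteration over dev extents,
 * which is why READA_BACK is the better readahead hint here.
 */
static void shrink_walk_sketch(struct btrfs_root *root, struct btrfs_device *device)
{
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return;
        path->reada = READA_BACK;       /* we move to previous items */

        key.objectid = device->devid;
        key.type = BTRFS_DEV_EXTENT_KEY;
        key.offset = (u64)-1;           /* start past the last dev extent */

        if (btrfs_search_slot(NULL, root, &key, path, 0, 0) >= 0) {
                /* Step backwards through dev extents, highest offset first. */
                while (btrfs_previous_item(root, path, device->devid,
                                           BTRFS_DEV_EXTENT_KEY) == 0) {
                        /* ... relocate or account the chunk at this extent ... */
                }
        }
        btrfs_free_path(path);
}
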
 
@@ -6401,7 +6342,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
  *
  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
  * on error.  Returned struct is not linked onto any lists and must be
- * destroyed with free_device.
+ * destroyed with btrfs_free_device.
  */
 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
                                        const u64 *devid,
@@ -6424,7 +6365,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 
                ret = find_next_devid(fs_info, &tmp);
                if (ret) {
-                       free_device(dev);
+                       btrfs_free_device(dev);
                        return ERR_PTR(ret);
                }
        }
@@ -6675,8 +6616,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
        if (IS_ERR(fs_devices))
                return fs_devices;
 
-       ret = __btrfs_open_devices(fs_devices, FMODE_READ,
-                                  fs_info->bdev_holder);
+       ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
        if (ret) {
                free_fs_devices(fs_devices);
                fs_devices = ERR_PTR(ret);
@@ -6684,7 +6624,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
        }
 
        if (!fs_devices->seeding) {
-               __btrfs_close_devices(fs_devices);
+               close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
                fs_devices = ERR_PTR(-EINVAL);
                goto out;