btrfs: use existing cur_devices, cleanup btrfs_rm_device
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 07706c0a5781045d7310ed3a4eae9648831adbdc..a382d53c560af673acb3424de1620dd92ac6ab27 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -40,6 +40,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
+               .raid_name      = "raid10",
+               .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
+               .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
@@ -49,6 +52,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
+               .raid_name      = "raid1",
+               .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
+               .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
@@ -58,6 +64,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
+               .raid_name      = "dup",
+               .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
+               .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
@@ -67,6 +76,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
+               .raid_name      = "raid0",
+               .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
+               .mindev_error   = 0,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
@@ -76,6 +88,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
+               .raid_name      = "single",
+               .bg_flag        = 0,
+               .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
@@ -85,6 +100,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 2,
+               .raid_name      = "raid5",
+               .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
+               .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
@@ -94,33 +112,19 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 3,
+               .raid_name      = "raid6",
+               .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
+               .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
        },
 };
 
-const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
-       [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
-       [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
-       [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
-       [BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
-       [BTRFS_RAID_SINGLE] = 0,
-       [BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
-       [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
-};
+const char *get_raid_name(enum btrfs_raid_types type)
+{
+       if (type >= BTRFS_NR_RAID_TYPES)
+               return NULL;
 
-/*
- * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
- * condition is not met. Zero means there's no corresponding
- * BTRFS_ERROR_DEV_*_NOT_MET value.
- */
-const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
-       [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
-       [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
-       [BTRFS_RAID_DUP]    = 0,
-       [BTRFS_RAID_RAID0]  = 0,
-       [BTRFS_RAID_SINGLE] = 0,
-       [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
-       [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
-};
+       return btrfs_raid_array[type].raid_name;
+}
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info);
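
The three parallel lookup tables (btrfs_raid_group, btrfs_raid_mindev_error,
and the raid name strings) collapse into per-type members of btrfs_raid_array,
and get_raid_name() becomes a bounds-checked accessor. A minimal userspace
sketch of the same table-driven pattern; the struct layout and the values
below are illustrative, not the kernel's exact definition:

    #include <stdio.h>

    /* Illustrative subset of btrfs_raid_attr; the real struct in
     * fs/btrfs/volumes.h also carries devs_min, ncopies, etc. */
    struct raid_attr {
        const char *raid_name;       /* printable profile name */
        unsigned long long bg_flag;  /* BTRFS_BLOCK_GROUP_* bit, 0 for single */
        int mindev_error;            /* error when devs_min is unmet, 0 if none */
    };

    enum raid_types { RAID_RAID1, RAID_SINGLE, NR_RAID_TYPES };

    static const struct raid_attr raid_array[NR_RAID_TYPES] = {
        [RAID_RAID1]  = { "raid1",  1ULL << 4, -1 },  /* made-up values */
        [RAID_SINGLE] = { "single", 0,          0 },
    };

    /* Same shape as the kernel's get_raid_name(): bounds check, then index. */
    static const char *get_raid_name(enum raid_types type)
    {
        if (type >= NR_RAID_TYPES)
            return NULL;
        return raid_array[type].raid_name;
    }

    int main(void)
    {
        printf("%s\n", get_raid_name(RAID_RAID1)); /* prints "raid1" */
        return 0;
    }
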
@@ -167,12 +171,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
  * may be used to exclude some operations from running concurrently without any
  * modifications to the list (see write_all_supers)
  *
- * volume_mutex
- * ------------
- * coarse lock owned by a mounted filesystem; used to exclude some operations
- * that cannot run in parallel and affect the higher-level properties of the
- * filesystem like: device add/deleting/resize/replace, or balance
- *
  * balance_mutex
  * -------------
  * protects balance structures (status, state) and context accessed from
@@ -1234,31 +1232,29 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
         */
        bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;
-       mutex_lock(&uuid_mutex);
 
        bdev = blkdev_get_by_path(path, flags, holder);
-       if (IS_ERR(bdev)) {
-               ret = PTR_ERR(bdev);
-               goto error;
-       }
+       if (IS_ERR(bdev))
+               return PTR_ERR(bdev);
 
        if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
                ret = -EINVAL;
                goto error_bdev_put;
        }
 
+       mutex_lock(&uuid_mutex);
        device = device_list_add(path, disk_super);
        if (IS_ERR(device))
                ret = PTR_ERR(device);
        else
                *fs_devices_ret = device->fs_devices;
+       mutex_unlock(&uuid_mutex);
 
        btrfs_release_disk_super(page);
 
 error_bdev_put:
        blkdev_put(bdev, flags);
-error:
-       mutex_unlock(&uuid_mutex);
+
        return ret;
 }
 
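With this change the uuid_mutex critical section shrinks to just
device_list_add(): opening the block device and reading the super block no
longer serialize against the global lock. The resulting flow, sketched with
error handling elided (not the verbatim function body):

    bdev = blkdev_get_by_path(path, flags, holder);           /* unlocked */
    btrfs_read_disk_super(bdev, bytenr, &page, &disk_super);  /* unlocked */

    mutex_lock(&uuid_mutex);
    device = device_list_add(path, disk_super);  /* the only step under the lock */
    mutex_unlock(&uuid_mutex);

    btrfs_release_disk_super(page);
    blkdev_put(bdev, flags);
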
@@ -1890,11 +1886,11 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
        } while (read_seqretry(&fs_info->profiles_lock, seq));
 
        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
-               if (!(all_avail & btrfs_raid_group[i]))
+               if (!(all_avail & btrfs_raid_array[i].bg_flag))
                        continue;
 
                if (num_devices < btrfs_raid_array[i].devs_min) {
-                       int ret = btrfs_raid_mindev_error[i];
+                       int ret = btrfs_raid_array[i].mindev_error;
 
                        if (ret)
                                return ret;
@@ -2019,20 +2015,25 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
         * (super_copy) should hold the device list mutex.
         */
 
+       /*
+        * In normal cases cur_devices == fs_devices. But when deleting
+        * a seed device, cur_devices should point to the seed's own
+        * fs_devices, found on the fs_devices->seed list.
+        */
        cur_devices = device->fs_devices;
        mutex_lock(&fs_devices->device_list_mutex);
        list_del_rcu(&device->dev_list);
 
-       device->fs_devices->num_devices--;
-       device->fs_devices->total_devices--;
+       cur_devices->num_devices--;
+       cur_devices->total_devices--;
 
        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
-               device->fs_devices->missing_devices--;
+               cur_devices->missing_devices--;
 
        btrfs_assign_next_active_device(fs_info, device, NULL);
 
        if (device->bdev) {
-               device->fs_devices->open_devices--;
+               cur_devices->open_devices--;
                /* remove sysfs entry */
                btrfs_sysfs_rm_device_link(fs_devices, device);
        }
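
The cur_devices comment is easiest to picture as a chain: each sprouted
filesystem keeps its seed's device list reachable through ->seed, and a
device belongs to exactly one list. A rough sketch of the relationship
(simplified; the real linkage lives in struct btrfs_fs_devices):

    /*
     * fs_devices (mounted fs) --seed--> seed fs_devices --seed--> ...
     *
     * device->fs_devices always points at the list that owns the device;
     * for a seed device that is one of the ->seed entries, which is why
     * the counters above are decremented on cur_devices rather than on
     * the top-level fs_devices.
     */
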
@@ -2218,10 +2219,6 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
                struct btrfs_device *tmp;
 
                devices = &fs_info->fs_devices->devices;
-               /*
-                * It is safe to read the devices since the volume_mutex
-                * is held by the caller.
-                */
                list_for_each_entry(tmp, devices, dev_list) {
                        if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                        &tmp->dev_state) && !tmp->bdev) {
@@ -3209,22 +3206,6 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
        }
 }
 
-/*
- * Should be called with both balance and volume mutexes held to
- * serialize other volume operations (add_dev/rm_dev/resize) with
- * restriper.  Same goes for reset_balance_state.
- */
-static void set_balance_control(struct btrfs_balance_control *bctl)
-{
-       struct btrfs_fs_info *fs_info = bctl->fs_info;
-
-       BUG_ON(fs_info->balance_ctl);
-
-       spin_lock(&fs_info->balance_lock);
-       fs_info->balance_ctl = bctl;
-       spin_unlock(&fs_info->balance_lock);
-}
-
 /*
  * Clear the balance status in fs_info and delete the balance item from disk.
  */
@@ -3789,12 +3770,12 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
 }
 
 /*
- * Should be called with both balance and volume mutexes held
+ * Should be called with the balance mutex held
  */
-int btrfs_balance(struct btrfs_balance_control *bctl,
+int btrfs_balance(struct btrfs_fs_info *fs_info,
+                 struct btrfs_balance_control *bctl,
                  struct btrfs_ioctl_balance_args *bargs)
 {
-       struct btrfs_fs_info *fs_info = bctl->fs_info;
        u64 meta_target, data_target;
        u64 allowed;
        int mixed = 0;
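
Callers now pass fs_info explicitly instead of btrfs_balance() reaching
through bctl->fs_info (the back-pointer assignment is dropped from the
recovery path in a later hunk). A caller sketch, assuming the usual
allocation done by the balance ioctl:

    struct btrfs_balance_control *bctl;

    bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
    if (!bctl)
        return -ENOMEM;
    /* ... fill bctl->data / bctl->meta / bctl->sys and bctl->flags ... */
    ret = btrfs_balance(fs_info, bctl, bargs);
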
@@ -3910,7 +3891,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 
        if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
                BUG_ON(ret == -EEXIST);
-               set_balance_control(bctl);
+               BUG_ON(fs_info->balance_ctl);
+               spin_lock(&fs_info->balance_lock);
+               fs_info->balance_ctl = bctl;
+               spin_unlock(&fs_info->balance_lock);
        } else {
                BUG_ON(ret != -EEXIST);
                spin_lock(&fs_info->balance_lock);
@@ -3918,17 +3902,18 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                spin_unlock(&fs_info->balance_lock);
        }
 
-       atomic_inc(&fs_info->balance_running);
+       ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+       set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
        mutex_unlock(&fs_info->balance_mutex);
 
        ret = __btrfs_balance(fs_info);
 
        mutex_lock(&fs_info->balance_mutex);
-       atomic_dec(&fs_info->balance_running);
+       clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
 
        if (bargs) {
                memset(bargs, 0, sizeof(*bargs));
-               update_ioctl_balance_args(fs_info, 0, bargs);
+               btrfs_update_ioctl_balance_args(fs_info, bargs);
        }
 
        if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
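
A single running/not-running state does not need an atomic counter, so
balance_running becomes the BTRFS_FS_BALANCE_RUNNING bit in fs_info->flags,
paired with the existing wait queue. The pattern, sketched; the wake_up()
side is assumed to live elsewhere in volumes.c:

    /* balance side */
    set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
    /* ... __btrfs_balance() runs with balance_mutex dropped ... */
    clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
    wake_up(&fs_info->balance_wait_q);  /* assumed: wakes pause/cancel waiters */

    /* pause/cancel side */
    wait_event(fs_info->balance_wait_q,
               !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
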
@@ -3955,16 +3940,12 @@ static int balance_kthread(void *data)
        struct btrfs_fs_info *fs_info = data;
        int ret = 0;
 
-       mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&fs_info->balance_mutex);
-
        if (fs_info->balance_ctl) {
                btrfs_info(fs_info, "continuing balance");
-               ret = btrfs_balance(fs_info->balance_ctl, NULL);
+               ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
        }
-
        mutex_unlock(&fs_info->balance_mutex);
-       mutex_unlock(&fs_info->volume_mutex);
 
        return ret;
 }
@@ -3973,12 +3954,12 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
 {
        struct task_struct *tsk;
 
-       spin_lock(&fs_info->balance_lock);
+       mutex_lock(&fs_info->balance_mutex);
        if (!fs_info->balance_ctl) {
-               spin_unlock(&fs_info->balance_lock);
+               mutex_unlock(&fs_info->balance_mutex);
                return 0;
        }
-       spin_unlock(&fs_info->balance_lock);
+       mutex_unlock(&fs_info->balance_mutex);
 
        if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
                btrfs_info(fs_info, "force skipping balance");
@@ -4033,7 +4014,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
 
-       bctl->fs_info = fs_info;
        bctl->flags = btrfs_balance_flags(leaf, item);
        bctl->flags |= BTRFS_BALANCE_RESUME;
 
@@ -4058,13 +4038,12 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
                btrfs_warn(fs_info,
        "cannot set exclusive op status to balance, resume manually");
 
-       mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&fs_info->balance_mutex);
-
-       set_balance_control(bctl);
-
+       BUG_ON(fs_info->balance_ctl);
+       spin_lock(&fs_info->balance_lock);
+       fs_info->balance_ctl = bctl;
+       spin_unlock(&fs_info->balance_lock);
        mutex_unlock(&fs_info->balance_mutex);
-       mutex_unlock(&fs_info->volume_mutex);
 out:
        btrfs_free_path(path);
        return ret;
@@ -4080,16 +4059,16 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
                return -ENOTCONN;
        }
 
-       if (atomic_read(&fs_info->balance_running)) {
+       if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
                atomic_inc(&fs_info->balance_pause_req);
                mutex_unlock(&fs_info->balance_mutex);
 
                wait_event(fs_info->balance_wait_q,
-                          atomic_read(&fs_info->balance_running) == 0);
+                          !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
 
                mutex_lock(&fs_info->balance_mutex);
                /* we are good with balance_ctl ripped off from under us */
-               BUG_ON(atomic_read(&fs_info->balance_running));
+               BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
                atomic_dec(&fs_info->balance_pause_req);
        } else {
                ret = -ENOTCONN;
@@ -4101,40 +4080,48 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
 
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
 {
-       if (sb_rdonly(fs_info->sb))
-               return -EROFS;
-
        mutex_lock(&fs_info->balance_mutex);
        if (!fs_info->balance_ctl) {
                mutex_unlock(&fs_info->balance_mutex);
                return -ENOTCONN;
        }
 
+       /*
+        * A paused balance with the item stored on disk can be resumed at
+        * mount time if the mount is read-write. Otherwise it's still paused
+        * and we must not allow cancelling as it deletes the item.
+        */
+       if (sb_rdonly(fs_info->sb)) {
+               mutex_unlock(&fs_info->balance_mutex);
+               return -EROFS;
+       }
+
        atomic_inc(&fs_info->balance_cancel_req);
        /*
         * if we are running just wait and return, balance item is
         * deleted in btrfs_balance in this case
         */
-       if (atomic_read(&fs_info->balance_running)) {
+       if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
                mutex_unlock(&fs_info->balance_mutex);
                wait_event(fs_info->balance_wait_q,
-                          atomic_read(&fs_info->balance_running) == 0);
+                          !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
                mutex_lock(&fs_info->balance_mutex);
        } else {
-               /* reset_balance_state needs volume_mutex */
                mutex_unlock(&fs_info->balance_mutex);
-               mutex_lock(&fs_info->volume_mutex);
+               /*
+                * The lock was released to let other waiters proceed;
+                * reexamine the status now that it is retaken.
+                */
                mutex_lock(&fs_info->balance_mutex);
 
                if (fs_info->balance_ctl) {
                        reset_balance_state(fs_info);
                        clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
                }
-
-               mutex_unlock(&fs_info->volume_mutex);
        }
 
-       BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+       BUG_ON(fs_info->balance_ctl ||
+               test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
        atomic_dec(&fs_info->balance_cancel_req);
        mutex_unlock(&fs_info->balance_mutex);
        return 0;
@@ -4429,7 +4416,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
        if (!path)
                return -ENOMEM;
 
-       path->reada = READA_FORWARD;
+       path->reada = READA_BACK;
 
        mutex_lock(&fs_info->chunk_mutex);
 
@@ -5990,9 +5977,8 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
        return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
 }
 
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
-                    u64 chunk_start, u64 physical, u64 devid,
-                    u64 **logical, int *naddrs, int *stripe_len)
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+                    u64 physical, u64 **logical, int *naddrs, int *stripe_len)
 {
        struct extent_map *em;
        struct map_lookup *map;
@@ -6024,8 +6010,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
        BUG_ON(!buf); /* -ENOMEM */
 
        for (i = 0; i < map->num_stripes; i++) {
-               if (devid && map->stripes[i].dev->devid != devid)
-                       continue;
                if (map->stripes[i].physical > physical ||
                    map->stripes[i].physical + length <= physical)
                        continue;