btrfs: use mutex in btrfs_resume_balance_async
[sfrench/cifs-2.6.git] / fs / btrfs / volumes.c
index 70a87d4fe5fe5e2676c8ce5f43cfab71d2e70a4d..447a0c275b43f5a43d61d15f2f838bd8d53cae67 100644 (file)
@@ -167,12 +167,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
  * may be used to exclude some operations from running concurrently without any
  * modifications to the list (see write_all_supers)
  *
- * volume_mutex
- * ------------
- * coarse lock owned by a mounted filesystem; used to exclude some operations
- * that cannot run in parallel and affect the higher-level properties of the
- * filesystem like: device add/deleting/resize/replace, or balance
- *
  * balance_mutex
  * -------------
  * protects balance structures (status, state) and context accessed from
@@ -269,7 +263,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
        return fs_devs;
 }
 
-static void free_device(struct btrfs_device *device)
+void btrfs_free_device(struct btrfs_device *device)
 {
        rcu_string_free(device->name);
        bio_put(device->flush_bio);
@@ -284,7 +278,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
-               free_device(device);
+               btrfs_free_device(device);
        }
        kfree(fs_devices);
 }
@@ -317,7 +311,7 @@ void __exit btrfs_cleanup_fs_uuids(void)
 /*
  * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
  * Returned struct is not linked onto any lists and must be destroyed using
- * free_device.
+ * btrfs_free_device.
  */
 static struct btrfs_device *__alloc_device(void)
 {
@@ -672,7 +666,7 @@ static void btrfs_free_stale_devices(const char *path,
                        } else {
                                fs_devs->num_devices--;
                                list_del(&dev->dev_list);
-                               free_device(dev);
+                               btrfs_free_device(dev);
                        }
                }
        }
@@ -787,7 +781,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 
                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
-                       free_device(device);
+                       btrfs_free_device(device);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);
@@ -900,7 +894,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
-                               free_device(device);
+                               btrfs_free_device(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
@@ -972,7 +966,7 @@ again:
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
-               free_device(device);
+               btrfs_free_device(device);
        }
 
        if (fs_devices->seed) {
@@ -990,7 +984,7 @@ static void free_device_rcu(struct rcu_head *head)
        struct btrfs_device *device;
 
        device = container_of(head, struct btrfs_device, rcu);
-       free_device(device);
+       btrfs_free_device(device);
 }
 
 static void btrfs_close_bdev(struct btrfs_device *device)
@@ -1954,7 +1948,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
        u64 num_devices;
        int ret = 0;
 
-       mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&uuid_mutex);
 
        num_devices = fs_devices->num_devices;
@@ -2068,7 +2061,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 
 out:
        mutex_unlock(&uuid_mutex);
-       mutex_unlock(&fs_info->volume_mutex);
        return ret;
 
 error_undo:
@@ -2220,10 +2212,6 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
                struct btrfs_device *tmp;
 
                devices = &fs_info->fs_devices->devices;
-               /*
-                * It is safe to read the devices since the volume_mutex
-                * is held by the caller.
-                */
                list_for_each_entry(tmp, devices, dev_list) {
                        if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                        &tmp->dev_state) && !tmp->bdev) {
@@ -2602,7 +2590,7 @@ error_trans:
        if (trans)
                btrfs_end_transaction(trans);
 error_free_device:
-       free_device(device);
+       btrfs_free_device(device);
 error:
        blkdev_put(bdev, FMODE_EXCL);
        if (seeding_dev && !unlocked) {
@@ -2612,99 +2600,6 @@ error:
        return ret;
 }
 
-int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
-                                 const char *device_path,
-                                 struct btrfs_device *srcdev,
-                                 struct btrfs_device **device_out)
-{
-       struct btrfs_device *device;
-       struct block_device *bdev;
-       struct list_head *devices;
-       struct rcu_string *name;
-       u64 devid = BTRFS_DEV_REPLACE_DEVID;
-       int ret = 0;
-
-       *device_out = NULL;
-       if (fs_info->fs_devices->seeding) {
-               btrfs_err(fs_info, "the filesystem is a seed filesystem!");
-               return -EINVAL;
-       }
-
-       bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
-                                 fs_info->bdev_holder);
-       if (IS_ERR(bdev)) {
-               btrfs_err(fs_info, "target device %s is invalid!", device_path);
-               return PTR_ERR(bdev);
-       }
-
-       filemap_write_and_wait(bdev->bd_inode->i_mapping);
-
-       devices = &fs_info->fs_devices->devices;
-       list_for_each_entry(device, devices, dev_list) {
-               if (device->bdev == bdev) {
-                       btrfs_err(fs_info,
-                                 "target device is in the filesystem!");
-                       ret = -EEXIST;
-                       goto error;
-               }
-       }
-
-
-       if (i_size_read(bdev->bd_inode) <
-           btrfs_device_get_total_bytes(srcdev)) {
-               btrfs_err(fs_info,
-                         "target device is smaller than source device!");
-               ret = -EINVAL;
-               goto error;
-       }
-
-
-       device = btrfs_alloc_device(NULL, &devid, NULL);
-       if (IS_ERR(device)) {
-               ret = PTR_ERR(device);
-               goto error;
-       }
-
-       name = rcu_string_strdup(device_path, GFP_KERNEL);
-       if (!name) {
-               free_device(device);
-               ret = -ENOMEM;
-               goto error;
-       }
-       rcu_assign_pointer(device->name, name);
-
-       mutex_lock(&fs_info->fs_devices->device_list_mutex);
-       set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
-       device->generation = 0;
-       device->io_width = fs_info->sectorsize;
-       device->io_align = fs_info->sectorsize;
-       device->sector_size = fs_info->sectorsize;
-       device->total_bytes = btrfs_device_get_total_bytes(srcdev);
-       device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
-       device->bytes_used = btrfs_device_get_bytes_used(srcdev);
-       device->commit_total_bytes = srcdev->commit_total_bytes;
-       device->commit_bytes_used = device->bytes_used;
-       device->fs_info = fs_info;
-       device->bdev = bdev;
-       set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
-       set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
-       device->mode = FMODE_EXCL;
-       device->dev_stats_valid = 1;
-       set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
-       device->fs_devices = fs_info->fs_devices;
-       list_add(&device->dev_list, &fs_info->fs_devices->devices);
-       fs_info->fs_devices->num_devices++;
-       fs_info->fs_devices->open_devices++;
-       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-
-       *device_out = device;
-       return ret;
-
-error:
-       blkdev_put(bdev, FMODE_EXCL);
-       return ret;
-}
-
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
                                        struct btrfs_device *device)
 {
@@ -3305,9 +3200,8 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
 }
 
 /*
- * Should be called with both balance and volume mutexes held to
- * serialize other volume operations (add_dev/rm_dev/resize) with
- * restriper.  Same goes for unset_balance_control.
+ * Should be called with balance mutex held to protect against checking the
+ * balance status or progress. Same goes for reset_balance_state.
  */
 static void set_balance_control(struct btrfs_balance_control *bctl)
 {
@@ -3320,9 +3214,13 @@ static void set_balance_control(struct btrfs_balance_control *bctl)
        spin_unlock(&fs_info->balance_lock);
 }
 
-static void unset_balance_control(struct btrfs_fs_info *fs_info)
+/*
+ * Clear the balance status in fs_info and delete the balance item from disk.
+ */
+static void reset_balance_state(struct btrfs_fs_info *fs_info)
 {
        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+       int ret;
 
        BUG_ON(!fs_info->balance_ctl);
 
@@ -3331,6 +3229,9 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info)
        spin_unlock(&fs_info->balance_lock);
 
        kfree(bctl);
+       ret = del_balance_item(fs_info);
+       if (ret)
+               btrfs_handle_fs_error(fs_info, ret, NULL);
 }
 
 /*
@@ -3867,18 +3768,6 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info)
                 atomic_read(&fs_info->balance_cancel_req) == 0);
 }
 
-static void __cancel_balance(struct btrfs_fs_info *fs_info)
-{
-       int ret;
-
-       unset_balance_control(fs_info);
-       ret = del_balance_item(fs_info);
-       if (ret)
-               btrfs_handle_fs_error(fs_info, ret, NULL);
-
-       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-}
-
 /* Non-zero return value signifies invalidity */
 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
                u64 allowed)
@@ -3889,7 +3778,7 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
 }
 
 /*
- * Should be called with both balance and volume mutexes held
+ * Should be called with balance mutex held
  */
 int btrfs_balance(struct btrfs_balance_control *bctl,
                  struct btrfs_ioctl_balance_args *bargs)
@@ -4018,22 +3907,24 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                spin_unlock(&fs_info->balance_lock);
        }
 
-       atomic_inc(&fs_info->balance_running);
+       ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+       set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
        mutex_unlock(&fs_info->balance_mutex);
 
        ret = __btrfs_balance(fs_info);
 
        mutex_lock(&fs_info->balance_mutex);
-       atomic_dec(&fs_info->balance_running);
+       clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
 
        if (bargs) {
                memset(bargs, 0, sizeof(*bargs));
-               update_ioctl_balance_args(fs_info, 0, bargs);
+               btrfs_update_ioctl_balance_args(fs_info, bargs);
        }
 
        if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
            balance_need_close(fs_info)) {
-               __cancel_balance(fs_info);
+               reset_balance_state(fs_info);
+               clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
        }
 
        wake_up(&fs_info->balance_wait_q);
@@ -4041,11 +3932,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        return ret;
 out:
        if (bctl->flags & BTRFS_BALANCE_RESUME)
-               __cancel_balance(fs_info);
-       else {
+               reset_balance_state(fs_info);
+       else
                kfree(bctl);
-               clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-       }
+       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+
        return ret;
 }
 
@@ -4054,16 +3945,12 @@ static int balance_kthread(void *data)
        struct btrfs_fs_info *fs_info = data;
        int ret = 0;
 
-       mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&fs_info->balance_mutex);
-
        if (fs_info->balance_ctl) {
                btrfs_info(fs_info, "continuing balance");
                ret = btrfs_balance(fs_info->balance_ctl, NULL);
        }
-
        mutex_unlock(&fs_info->balance_mutex);
-       mutex_unlock(&fs_info->volume_mutex);
 
        return ret;
 }
@@ -4072,12 +3959,12 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
 {
        struct task_struct *tsk;
 
-       spin_lock(&fs_info->balance_lock);
+       mutex_lock(&fs_info->balance_mutex);
        if (!fs_info->balance_ctl) {
-               spin_unlock(&fs_info->balance_lock);
+               mutex_unlock(&fs_info->balance_mutex);
                return 0;
        }
-       spin_unlock(&fs_info->balance_lock);
+       mutex_unlock(&fs_info->balance_mutex);
 
        if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
                btrfs_info(fs_info, "force skipping balance");
@@ -4143,15 +4030,23 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
        btrfs_balance_sys(leaf, item, &disk_bargs);
        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
 
-       WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
+       /*
+        * This should never happen, as the paused balance state is recovered
+        * during mount without any chance of other exclusive ops to collide.
+        *
+        * This gives the exclusive op status to balance and keeps it in paused
+        * state until user intervention (cancel or umount). If the ownership
+        * cannot be assigned, show a message but do not fail. The balance
+        * is in a paused state and must have fs_info::balance_ctl properly
+        * set up.
+        */
+       if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+               btrfs_warn(fs_info,
+       "cannot set exclusive op status to balance, resume manually");
 
-       mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&fs_info->balance_mutex);
-
        set_balance_control(bctl);
-
        mutex_unlock(&fs_info->balance_mutex);
-       mutex_unlock(&fs_info->volume_mutex);
 out:
        btrfs_free_path(path);
        return ret;
@@ -4167,16 +4062,16 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
                return -ENOTCONN;
        }
 
-       if (atomic_read(&fs_info->balance_running)) {
+       if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
                atomic_inc(&fs_info->balance_pause_req);
                mutex_unlock(&fs_info->balance_mutex);
 
                wait_event(fs_info->balance_wait_q,
-                          atomic_read(&fs_info->balance_running) == 0);
+                          !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
 
                mutex_lock(&fs_info->balance_mutex);
                /* we are good with balance_ctl ripped off from under us */
-               BUG_ON(atomic_read(&fs_info->balance_running));
+               BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
                atomic_dec(&fs_info->balance_pause_req);
        } else {
                ret = -ENOTCONN;
@@ -4188,38 +4083,48 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
 
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
 {
-       if (sb_rdonly(fs_info->sb))
-               return -EROFS;
-
        mutex_lock(&fs_info->balance_mutex);
        if (!fs_info->balance_ctl) {
                mutex_unlock(&fs_info->balance_mutex);
                return -ENOTCONN;
        }
 
+       /*
+        * A paused balance with the item stored on disk can be resumed at
+        * mount time if the mount is read-write. Otherwise it's still paused
+        * and we must not allow cancelling as it deletes the item.
+        */
+       if (sb_rdonly(fs_info->sb)) {
+               mutex_unlock(&fs_info->balance_mutex);
+               return -EROFS;
+       }
+
        atomic_inc(&fs_info->balance_cancel_req);
        /*
         * if we are running just wait and return, balance item is
         * deleted in btrfs_balance in this case
         */
-       if (atomic_read(&fs_info->balance_running)) {
+       if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
                mutex_unlock(&fs_info->balance_mutex);
                wait_event(fs_info->balance_wait_q,
-                          atomic_read(&fs_info->balance_running) == 0);
+                          !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
                mutex_lock(&fs_info->balance_mutex);
        } else {
-               /* __cancel_balance needs volume_mutex */
                mutex_unlock(&fs_info->balance_mutex);
-               mutex_lock(&fs_info->volume_mutex);
+               /*
+                * Lock released to allow other waiters to continue, we'll
+                * reexamine the status.
+                */
                mutex_lock(&fs_info->balance_mutex);
 
-               if (fs_info->balance_ctl)
-                       __cancel_balance(fs_info);
-
-               mutex_unlock(&fs_info->volume_mutex);
+               if (fs_info->balance_ctl) {
+                       reset_balance_state(fs_info);
+                       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+               }
        }
 
-       BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+       BUG_ON(fs_info->balance_ctl ||
+               test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
        atomic_dec(&fs_info->balance_cancel_req);
        mutex_unlock(&fs_info->balance_mutex);
        return 0;
@@ -6442,7 +6347,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
  *
  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
  * on error.  Returned struct is not linked onto any lists and must be
- * destroyed with free_device.
+ * destroyed with btrfs_free_device.
  */
 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
                                        const u64 *devid,
@@ -6465,7 +6370,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 
                ret = find_next_devid(fs_info, &tmp);
                if (ret) {
-                       free_device(dev);
+                       btrfs_free_device(dev);
                        return ERR_PTR(ret);
                }
        }