btrfs: alloc_chunk: fix DUP stripe size handling
[sfrench/cifs-2.6.git] / fs / btrfs / volumes.c
index f85991a2585cf1c55601101cdebc9fbfba7cee58..b2d05c6b1c5672a638ba8752333c10508e366885 100644 (file)
@@ -319,7 +319,6 @@ static struct btrfs_device *__alloc_device(void)
 
        spin_lock_init(&dev->io_lock);
 
-       spin_lock_init(&dev->reada_lock);
        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
@@ -605,56 +604,53 @@ static void pending_bios_fn(struct btrfs_work *work)
        run_scheduled_bios(device);
 }
 
-
-static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+/*
+ *  Search and remove all stale devices (devices which are not mounted).
+ *  When both inputs are NULL, it will search and release all stale devices.
+ *  path:      Optional. When provided, it will release all unmounted devices
+ *             matching this path only.
+ *  skip_dev:  Optional. Will skip this device when searching for the stale
+ *             devices.
+ */
+static void btrfs_free_stale_devices(const char *path,
+                                    struct btrfs_device *skip_dev)
 {
-       struct btrfs_fs_devices *fs_devs;
-       struct btrfs_device *dev;
-
-       if (!cur_dev->name)
-               return;
+       struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
+       struct btrfs_device *dev, *tmp_dev;
 
-       list_for_each_entry(fs_devs, &fs_uuids, list) {
-               int del = 1;
+       list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {
 
                if (fs_devs->opened)
                        continue;
-               if (fs_devs->seeding)
-                       continue;
 
-               list_for_each_entry(dev, &fs_devs->devices, dev_list) {
+               list_for_each_entry_safe(dev, tmp_dev,
+                                        &fs_devs->devices, dev_list) {
+                       int not_found = 0;
 
-                       if (dev == cur_dev)
+                       if (skip_dev && skip_dev == dev)
                                continue;
-                       if (!dev->name)
+                       if (path && !dev->name)
                                continue;
 
-                       /*
-                        * Todo: This won't be enough. What if the same device
-                        * comes back (with new uuid and) with its mapper path?
-                        * But for now, this does help as mostly an admin will
-                        * either use mapper or non mapper path throughout.
-                        */
                        rcu_read_lock();
-                       del = strcmp(rcu_str_deref(dev->name),
-                                               rcu_str_deref(cur_dev->name));
+                       if (path)
+                               not_found = strcmp(rcu_str_deref(dev->name),
+                                                  path);
                        rcu_read_unlock();
-                       if (!del)
-                               break;
-               }
+                       if (not_found)
+                               continue;
 
-               if (!del) {
                        /* delete the stale device */
                        if (fs_devs->num_devices == 1) {
                                btrfs_sysfs_remove_fsid(fs_devs);
                                list_del(&fs_devs->list);
                                free_fs_devices(fs_devs);
+                               break;
                        } else {
                                fs_devs->num_devices--;
                                list_del(&dev->dev_list);
                                free_device(dev);
                        }
-                       break;
                }
        }
 }
@@ -729,25 +725,23 @@ error_brelse:
  * Add new device to list of registered devices
  *
  * Returns:
- * 1   - first time device is seen
- * 0   - device already known
- * < 0 - error
+ * device pointer which was just added or updated when successful
+ * error pointer when failed
  */
-static noinline int device_list_add(const char *path,
-                          struct btrfs_super_block *disk_super,
-                          u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+static noinline struct btrfs_device *device_list_add(const char *path,
+                          struct btrfs_super_block *disk_super)
 {
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
-       int ret = 0;
        u64 found_transid = btrfs_super_generation(disk_super);
+       u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
 
        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = alloc_fs_devices(disk_super->fsid);
                if (IS_ERR(fs_devices))
-                       return PTR_ERR(fs_devices);
+                       return ERR_CAST(fs_devices);
 
                list_add(&fs_devices->list, &fs_uuids);
 
@@ -759,19 +753,19 @@ static noinline int device_list_add(const char *path,
 
        if (!device) {
                if (fs_devices->opened)
-                       return -EBUSY;
+                       return ERR_PTR(-EBUSY);
 
                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        /* we can safely leave the fs_devices entry around */
-                       return PTR_ERR(device);
+                       return device;
                }
 
                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        free_device(device);
-                       return -ENOMEM;
+                       return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);
 
@@ -780,8 +774,16 @@ static noinline int device_list_add(const char *path,
                fs_devices->num_devices++;
                mutex_unlock(&fs_devices->device_list_mutex);
 
-               ret = 1;
                device->fs_devices = fs_devices;
+               btrfs_free_stale_devices(path, device);
+
+               if (disk_super->label[0])
+                       pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
+                               disk_super->label, devid, found_transid, path);
+               else
+                       pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
+                               disk_super->fsid, devid, found_transid, path);
+
        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When FS is already mounted.
@@ -817,12 +819,12 @@ static noinline int device_list_add(const char *path,
                         * with larger generation number or the last-in if
                         * generation are equal.
                         */
-                       return -EEXIST;
+                       return ERR_PTR(-EEXIST);
                }
 
                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name)
-                       return -ENOMEM;
+                       return ERR_PTR(-ENOMEM);
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
@@ -840,16 +842,9 @@ static noinline int device_list_add(const char *path,
        if (!fs_devices->opened)
                device->generation = found_transid;
 
-       /*
-        * if there is new btrfs on an already registered device,
-        * then remove the stale device entry.
-        */
-       if (ret > 0)
-               btrfs_free_stale_device(device);
-
-       *fs_devices_ret = fs_devices;
+       fs_devices->total_devices = btrfs_super_num_devices(disk_super);
 
-       return ret;
+       return device;
 }
 
 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -1184,12 +1179,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
 {
        struct btrfs_super_block *disk_super;
+       struct btrfs_device *device;
        struct block_device *bdev;
        struct page *page;
-       int ret = -EINVAL;
-       u64 devid;
-       u64 transid;
-       u64 total_devices;
+       int ret = 0;
        u64 bytenr;
 
        /*
@@ -1208,26 +1201,16 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                goto error;
        }
 
-       if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
+       if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
+               ret = -EINVAL;
                goto error_bdev_put;
-
-       devid = btrfs_stack_device_id(&disk_super->dev_item);
-       transid = btrfs_super_generation(disk_super);
-       total_devices = btrfs_super_num_devices(disk_super);
-
-       ret = device_list_add(path, disk_super, devid, fs_devices_ret);
-       if (ret > 0) {
-               if (disk_super->label[0]) {
-                       pr_info("BTRFS: device label %s ", disk_super->label);
-               } else {
-                       pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
-               }
-
-               pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
-               ret = 0;
        }
-       if (!ret && fs_devices_ret)
-               (*fs_devices_ret)->total_devices = total_devices;
+
+       device = device_list_add(path, disk_super);
+       if (IS_ERR(device))
+               ret = PTR_ERR(device);
+       else
+               *fs_devices_ret = device->fs_devices;
 
        btrfs_release_disk_super(page);
 
@@ -3105,6 +3088,48 @@ error:
        return ret;
 }
 
+/*
+ * return 1 : a data chunk was allocated successfully,
+ * return <0: an error occurred while allocating a data chunk,
+ * return 0 : no need to allocate a data chunk.
+ */
+static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
+                                     u64 chunk_offset)
+{
+       struct btrfs_block_group_cache *cache;
+       u64 bytes_used;
+       u64 chunk_type;
+
+       cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+       ASSERT(cache);
+       chunk_type = cache->flags;
+       btrfs_put_block_group(cache);
+
+       if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
+               spin_lock(&fs_info->data_sinfo->lock);
+               bytes_used = fs_info->data_sinfo->bytes_used;
+               spin_unlock(&fs_info->data_sinfo->lock);
+
+               if (!bytes_used) {
+                       struct btrfs_trans_handle *trans;
+                       int ret;
+
+                       trans = btrfs_join_transaction(fs_info->tree_root);
+                       if (IS_ERR(trans))
+                               return PTR_ERR(trans);
+
+                       ret = btrfs_force_chunk_alloc(trans, fs_info,
+                                                     BTRFS_BLOCK_GROUP_DATA);
+                       btrfs_end_transaction(trans);
+                       if (ret < 0)
+                               return ret;
+
+                       return 1;
+               }
+       }
+       return 0;
+}
+
 static int insert_balance_item(struct btrfs_fs_info *fs_info,
                               struct btrfs_balance_control *bctl)
 {
@@ -3563,7 +3588,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
        u32 count_meta = 0;
        u32 count_sys = 0;
        int chunk_reserved = 0;
-       u64 bytes_used = 0;
 
        /* step one make some room on all the devices */
        devices = &fs_info->fs_devices->devices;
@@ -3722,28 +3746,21 @@ again:
                        goto loop;
                }
 
-               ASSERT(fs_info->data_sinfo);
-               spin_lock(&fs_info->data_sinfo->lock);
-               bytes_used = fs_info->data_sinfo->bytes_used;
-               spin_unlock(&fs_info->data_sinfo->lock);
-
-               if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
-                   !chunk_reserved && !bytes_used) {
-                       trans = btrfs_start_transaction(chunk_root, 0);
-                       if (IS_ERR(trans)) {
-                               mutex_unlock(&fs_info->delete_unused_bgs_mutex);
-                               ret = PTR_ERR(trans);
-                               goto error;
-                       }
-
-                       ret = btrfs_force_chunk_alloc(trans, fs_info,
-                                                     BTRFS_BLOCK_GROUP_DATA);
-                       btrfs_end_transaction(trans);
+               if (!chunk_reserved) {
+                       /*
+                        * We may be relocating the only data chunk we have,
+                        * which could potentially end up losing data's
+                        * raid profile, so let's allocate an empty one in
+                        * advance.
+                        */
+                       ret = btrfs_may_alloc_data_chunk(fs_info,
+                                                        found_key.offset);
                        if (ret < 0) {
                                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
                                goto error;
+                       } else if (ret == 1) {
+                               chunk_reserved = 1;
                        }
-                       chunk_reserved = 1;
                }
 
                ret = btrfs_relocate_chunk(fs_info, found_key.offset);
@@ -4506,6 +4523,18 @@ again:
                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
                btrfs_release_path(path);
 
+               /*
+                * We may be relocating the only data chunk we have,
+                * which could potentially end up losing data's
+                * raid profile, so let's allocate an empty one in
+                * advance.
+                */
+               ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
+               if (ret < 0) {
+                       mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+                       goto done;
+               }
+
                ret = btrfs_relocate_chunk(fs_info, chunk_offset);
                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
                if (ret && ret != -ENOSPC)
@@ -4800,10 +4829,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        ndevs = min(ndevs, devs_max);
 
        /*
-        * the primary goal is to maximize the number of stripes, so use as many
-        * devices as possible, even if the stripes are not maximum sized.
+        * The primary goal is to maximize the number of stripes, so use as
+        * many devices as possible, even if the stripes are not maximum sized.
+        *
+        * The DUP profile stores more than one stripe per device; the
+        * max_avail is the total size, so we have to adjust.
         */
-       stripe_size = devices_info[ndevs-1].max_avail;
+       stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
        num_stripes = ndevs * dev_stripes;
 
        /*
@@ -4838,8 +4870,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                        stripe_size = devices_info[ndevs-1].max_avail;
        }
 
-       stripe_size = div_u64(stripe_size, dev_stripes);
-
        /* align to BTRFS_STRIPE_LEN */
        stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
 
@@ -5167,7 +5197,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
        else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
                ret = 2;
        else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
-               ret = 3;
+               /*
+                * There could be two corrupted data stripes, so we need
+                * to retry in a loop in order to rebuild the correct data.
+                *
+                * Fail a stripe at a time on every retry except the
+                * stripe under reconstruction.
+                */
+               ret = map->num_stripes;
        else
                ret = 1;
        free_extent_map(em);