btrfs: introduce nparity raid_attr
[sfrench/cifs-2.6.git] / fs / btrfs / volumes.c
index f435d397019eae589f4e9daf88c954229537a60e..29fc8a09dd2ea9622be49f4305856d46f37afe21 100644 (file)
@@ -37,6 +37,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
+               .nparity        = 0,
                .raid_name      = "raid10",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
                .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
@@ -49,6 +50,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
+               .nparity        = 0,
                .raid_name      = "raid1",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
@@ -61,6 +63,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
+               .nparity        = 0,
                .raid_name      = "dup",
                .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
                .mindev_error   = 0,
@@ -73,6 +76,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
+               .nparity        = 0,
                .raid_name      = "raid0",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
                .mindev_error   = 0,
@@ -85,6 +89,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
+               .nparity        = 0,
                .raid_name      = "single",
                .bg_flag        = 0,
                .mindev_error   = 0,
@@ -96,7 +101,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
-               .ncopies        = 2,
+               .ncopies        = 1,
+               .nparity        = 1,
                .raid_name      = "raid5",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
                .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
@@ -108,7 +114,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
-               .ncopies        = 3,
+               .ncopies        = 1,
+               .nparity        = 2,
                .raid_name      = "raid6",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
                .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
@@ -1900,6 +1907,14 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
                goto out;
        }
 
+       if (btrfs_pinned_by_swapfile(fs_info, device)) {
+               btrfs_warn_in_rcu(fs_info,
+                 "cannot remove device %s (devid %llu) due to active swapfile",
+                                 rcu_str_deref(device->name), device->devid);
+               ret = -ETXTBSY;
+               goto out;
+       }
+
        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                ret = BTRFS_ERROR_DEV_TGT_REPLACE;
                goto out;
@@ -2718,8 +2733,15 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
        return ret;
 }
 
-static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
-                                       u64 logical, u64 length)
+/*
+ * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * @logical: Logical block offset in bytes.
+ * @length: Length of extent in bytes.
+ *
+ * Return: Chunk mapping or ERR_PTR.
+ */
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+                                      u64 logical, u64 length)
 {
        struct extent_map_tree *em_tree;
        struct extent_map *em;
@@ -2756,7 +2778,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
        int i, ret = 0;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 
-       em = get_chunk_map(fs_info, chunk_offset, 1);
+       em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
        if (IS_ERR(em)) {
                /*
                 * This is a logic error, but we don't want to just rely on the
@@ -3638,10 +3660,15 @@ again:
 
                ret = btrfs_relocate_chunk(fs_info, found_key.offset);
                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
-               if (ret && ret != -ENOSPC)
-                       goto error;
                if (ret == -ENOSPC) {
                        enospc_errors++;
+               } else if (ret == -ETXTBSY) {
+                       btrfs_info(fs_info,
+          "skipping relocation of block group %llu due to active swapfile",
+                                  found_key.offset);
+                       ret = 0;
+               } else if (ret) {
+                       goto error;
                } else {
                        spin_lock(&fs_info->balance_lock);
                        bctl->stat.completed++;
@@ -4433,10 +4460,16 @@ again:
 
                ret = btrfs_relocate_chunk(fs_info, chunk_offset);
                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
-               if (ret && ret != -ENOSPC)
-                       goto done;
-               if (ret == -ENOSPC)
+               if (ret == -ENOSPC) {
                        failed++;
+               } else if (ret) {
+                       if (ret == -ETXTBSY) {
+                               btrfs_warn(fs_info,
+                  "could not shrink block group %llu due to active swapfile",
+                                          chunk_offset);
+                       }
+                       goto done;
+               }
        } while (key.offset-- > 0);
 
        if (failed && !retried) {
@@ -4602,11 +4635,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        int devs_min;           /* min devs needed */
        int devs_increment;     /* ndevs has to be a multiple of this */
        int ncopies;            /* how many copies to data has */
+       int nparity;            /* number of stripes worth of bytes to
+                                  store parity information */
        int ret;
        u64 max_stripe_size;
        u64 max_chunk_size;
        u64 stripe_size;
-       u64 num_bytes;
+       u64 chunk_size;
        int ndevs;
        int i;
        int j;
@@ -4628,6 +4663,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        devs_min = btrfs_raid_array[index].devs_min;
        devs_increment = btrfs_raid_array[index].devs_increment;
        ncopies = btrfs_raid_array[index].ncopies;
+       nparity = btrfs_raid_array[index].nparity;
 
        if (type & BTRFS_BLOCK_GROUP_DATA) {
                max_stripe_size = SZ_1G;
@@ -4757,30 +4793,22 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
         * this will have to be fixed for RAID1 and RAID10 over
         * more drives
         */
-       data_stripes = num_stripes / ncopies;
-
-       if (type & BTRFS_BLOCK_GROUP_RAID5)
-               data_stripes = num_stripes - 1;
-
-       if (type & BTRFS_BLOCK_GROUP_RAID6)
-               data_stripes = num_stripes - 2;
+       data_stripes = (num_stripes - nparity) / ncopies;
 
        /*
         * Use the number of data stripes to figure out how big this chunk
         * is really going to be in terms of logical address space,
-        * and compare that answer with the max chunk size
+        * and compare that answer with the max chunk size. If it's higher,
+        * we try to reduce stripe_size.
         */
        if (stripe_size * data_stripes > max_chunk_size) {
-               stripe_size = div_u64(max_chunk_size, data_stripes);
-
-               /* bump the answer up to a 16MB boundary */
-               stripe_size = round_up(stripe_size, SZ_16M);
-
                /*
-                * But don't go higher than the limits we found while searching
-                * for free extents
+                * Reduce stripe_size, round it up to a 16MB boundary again and
+                * then use it, unless it ends up being even bigger than the
+                * previous value we had already.
                 */
-               stripe_size = min(devices_info[ndevs - 1].max_avail,
+               stripe_size = min(round_up(div_u64(max_chunk_size,
+                                                  data_stripes), SZ_16M),
                                  stripe_size);
        }
 
@@ -4808,9 +4836,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        map->type = type;
        map->sub_stripes = sub_stripes;
 
-       num_bytes = stripe_size * data_stripes;
+       chunk_size = stripe_size * data_stripes;
 
-       trace_btrfs_chunk_alloc(info, map, start, num_bytes);
+       trace_btrfs_chunk_alloc(info, map, start, chunk_size);
 
        em = alloc_extent_map();
        if (!em) {
@@ -4821,7 +4849,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
        em->map_lookup = map;
        em->start = start;
-       em->len = num_bytes;
+       em->len = chunk_size;
        em->block_start = 0;
        em->block_len = em->len;
        em->orig_block_len = stripe_size;
@@ -4839,14 +4867,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        refcount_inc(&em->refs);
        write_unlock(&em_tree->lock);
 
-       ret = btrfs_make_block_group(trans, 0, type, start, num_bytes);
+       ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
        if (ret)
                goto error_del_extent;
 
-       for (i = 0; i < map->num_stripes; i++) {
-               num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
-               btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
-       }
+       for (i = 0; i < map->num_stripes; i++)
+               btrfs_device_set_bytes_used(map->stripes[i].dev,
+                               map->stripes[i].dev->bytes_used + stripe_size);
 
        atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
 
@@ -4890,7 +4917,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
        int i = 0;
        int ret = 0;
 
-       em = get_chunk_map(fs_info, chunk_offset, chunk_size);
+       em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
        if (IS_ERR(em))
                return PTR_ERR(em);
 
@@ -5032,7 +5059,7 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
        int miss_ndevs = 0;
        int i;
 
-       em = get_chunk_map(fs_info, chunk_offset, 1);
+       em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
        if (IS_ERR(em))
                return 1;
 
@@ -5092,7 +5119,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
        struct map_lookup *map;
        int ret;
 
-       em = get_chunk_map(fs_info, logical, len);
+       em = btrfs_get_chunk_map(fs_info, logical, len);
        if (IS_ERR(em))
                /*
                 * We could return errors for these cases, but that could get
@@ -5138,7 +5165,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
        struct map_lookup *map;
        unsigned long len = fs_info->sectorsize;
 
-       em = get_chunk_map(fs_info, logical, len);
+       em = btrfs_get_chunk_map(fs_info, logical, len);
 
        if (!WARN_ON(IS_ERR(em))) {
                map = em->map_lookup;
@@ -5155,7 +5182,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
        struct map_lookup *map;
        int ret = 0;
 
-       em = get_chunk_map(fs_info, logical, len);
+       em = btrfs_get_chunk_map(fs_info, logical, len);
 
        if(!WARN_ON(IS_ERR(em))) {
                map = em->map_lookup;
@@ -5314,7 +5341,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
        /* discard always return a bbio */
        ASSERT(bbio_ret);
 
-       em = get_chunk_map(fs_info, logical, length);
+       em = btrfs_get_chunk_map(fs_info, logical, length);
        if (IS_ERR(em))
                return PTR_ERR(em);
 
@@ -5640,7 +5667,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                return __btrfs_map_block_for_discard(fs_info, logical,
                                                     *length, bbio_ret);
 
-       em = get_chunk_map(fs_info, logical, *length);
+       em = btrfs_get_chunk_map(fs_info, logical, *length);
        if (IS_ERR(em))
                return PTR_ERR(em);
 
@@ -5943,7 +5970,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
        u64 rmap_len;
        int i, j, nr = 0;
 
-       em = get_chunk_map(fs_info, chunk_start, 1);
+       em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
        if (IS_ERR(em))
                return -EIO;
 
@@ -6083,12 +6110,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
        int should_queue = 1;
        struct btrfs_pending_bios *pending_bios;
 
-       if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
-           !device->bdev) {
-               bio_io_error(bio);
-               return;
-       }
-
        /* don't bother with additional async steps for reads, right now */
        if (bio_op(bio) == REQ_OP_READ) {
                btrfsic_submit_bio(bio);
@@ -6217,7 +6238,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 
        for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
                dev = bbio->stripes[dev_nr].dev;
-               if (!dev || !dev->bdev ||
+               if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
+                                                  &dev->dev_state) ||
                    (bio_op(first_bio) == REQ_OP_WRITE &&
                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
                        bbio_error(bbio, first_bio, logical);
@@ -7387,6 +7409,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
        struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
        struct extent_map *em;
        struct map_lookup *map;
+       struct btrfs_device *dev;
        u64 stripe_len;
        bool found = false;
        int ret = 0;
@@ -7436,6 +7459,22 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
                        physical_offset, devid);
                ret = -EUCLEAN;
        }
+
+       /* Make sure no dev extent is beyond device boundary */
+       dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+       if (!dev) {
+               btrfs_err(fs_info, "failed to find devid %llu", devid);
+               ret = -EUCLEAN;
+               goto out;
+       }
+       if (physical_offset + physical_len > dev->disk_total_bytes) {
+               btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
+                         devid, physical_offset, physical_len,
+                         dev->disk_total_bytes);
+               ret = -EUCLEAN;
+               goto out;
+       }
 out:
        free_extent_map(em);
        return ret;
@@ -7478,6 +7517,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
        struct btrfs_path *path;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
+       u64 prev_devid = 0;
+       u64 prev_dev_ext_end = 0;
        int ret = 0;
 
        key.objectid = 1;
@@ -7522,10 +7563,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
                chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
                physical_len = btrfs_dev_extent_length(leaf, dext);
 
+               /* Check if this dev extent overlaps with the previous one */
+               if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+                       btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
+                                 devid, physical_offset, prev_dev_ext_end);
+                       ret = -EUCLEAN;
+                       goto out;
+               }
+
                ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
                                            physical_offset, physical_len);
                if (ret < 0)
                        goto out;
+               prev_devid = devid;
+               prev_dev_ext_end = physical_offset + physical_len;
+
                ret = btrfs_next_item(root, path);
                if (ret < 0)
                        goto out;
@@ -7541,3 +7594,27 @@ out:
        btrfs_free_path(path);
        return ret;
 }
+
+/*
+ * Check whether the given block group or device is pinned by any inode being
+ * used as a swapfile.
+ */
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
+{
+       struct btrfs_swapfile_pin *sp;
+       struct rb_node *node;
+
+       spin_lock(&fs_info->swapfile_pins_lock);
+       node = fs_info->swapfile_pins.rb_node;
+       while (node) {
+               sp = rb_entry(node, struct btrfs_swapfile_pin, node);
+               if (ptr < sp->ptr)
+                       node = node->rb_left;
+               else if (ptr > sp->ptr)
+                       node = node->rb_right;
+               else
+                       break;
+       }
+       spin_unlock(&fs_info->swapfile_pins_lock);
+       return node != NULL;
+}