btrfs: introduce nparity raid_attr

[sfrench/cifs-2.6.git] / fs / btrfs / volumes.c
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c

index f435d397019eae589f4e9daf88c954229537a60e..29fc8a09dd2ea9622be49f4305856d46f37afe21 100644 (file)
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -37,6 +37,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                 .tolerated_failures = 1,
                 .devs_increment = 2,
                 .ncopies        = 2,
+               .nparity        = 0,
                 .raid_name      = "raid10",
                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
                 .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
@@ -49,6 +50,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                 .tolerated_failures = 1,
                 .devs_increment = 2,
                 .ncopies        = 2,
+               .nparity        = 0,
                 .raid_name      = "raid1",
                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
                 .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
@@ -61,6 +63,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                 .tolerated_failures = 0,
                 .devs_increment = 1,
                 .ncopies        = 2,
+               .nparity        = 0,
                 .raid_name      = "dup",
                 .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
                 .mindev_error   = 0,
@@ -73,6 +76,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                 .tolerated_failures = 0,
                 .devs_increment = 1,
                 .ncopies        = 1,
+               .nparity        = 0,
                 .raid_name      = "raid0",
                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
                 .mindev_error   = 0,
@@ -85,6 +89,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                 .tolerated_failures = 0,
                 .devs_increment = 1,
                 .ncopies        = 1,
+               .nparity        = 0,
                 .raid_name      = "single",
                 .bg_flag        = 0,
                 .mindev_error   = 0,
@@ -96,7 +101,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                 .devs_min       = 2,
                 .tolerated_failures = 1,
                 .devs_increment = 1,
-               .ncopies        = 2,
+               .ncopies        = 1,
+               .nparity        = 1,
                 .raid_name      = "raid5",
                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
                 .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
@@ -108,7 +114,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
                 .devs_min       = 3,
                 .tolerated_failures = 2,
                 .devs_increment = 1,
-               .ncopies        = 3,
+               .ncopies        = 1,
+               .nparity        = 2,
                 .raid_name      = "raid6",
                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
                 .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
@@ -1900,6 +1907,14 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
                 goto out;
         }
  
+       if (btrfs_pinned_by_swapfile(fs_info, device)) {
+               btrfs_warn_in_rcu(fs_info,
+                 "cannot remove device %s (devid %llu) due to active swapfile",
+                                 rcu_str_deref(device->name), device->devid);
+               ret = -ETXTBSY;
+               goto out;
+       }
+
         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
                 goto out;
@@ -2718,8 +2733,15 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
         return ret;
  }
  
-static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
-                                       u64 logical, u64 length)
+/*
+ * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * @logical: Logical block offset in bytes.
+ * @length: Length of extent in bytes.
+ *
+ * Return: Chunk mapping or ERR_PTR.
+ */
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+                                      u64 logical, u64 length)
  {
         struct extent_map_tree *em_tree;
         struct extent_map *em;
@@ -2756,7 +2778,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
         int i, ret = 0;
         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
  
-       em = get_chunk_map(fs_info, chunk_offset, 1);
+       em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
         if (IS_ERR(em)) {
                 /*
                  * This is a logic error, but we don't want to just rely on the
@@ -3638,10 +3660,15 @@ again:
  
                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
-               if (ret && ret != -ENOSPC)
-                       goto error;
                 if (ret == -ENOSPC) {
                         enospc_errors++;
+               } else if (ret == -ETXTBSY) {
+                       btrfs_info(fs_info,
+          "skipping relocation of block group %llu due to active swapfile",
+                                  found_key.offset);
+                       ret = 0;
+               } else if (ret) {
+                       goto error;
                 } else {
                         spin_lock(&fs_info->balance_lock);
                         bctl->stat.completed++;
@@ -4433,10 +4460,16 @@ again:
  
                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
-               if (ret && ret != -ENOSPC)
-                       goto done;
-               if (ret == -ENOSPC)
+               if (ret == -ENOSPC) {
                         failed++;
+               } else if (ret) {
+                       if (ret == -ETXTBSY) {
+                               btrfs_warn(fs_info,
+                  "could not shrink block group %llu due to active swapfile",
+                                          chunk_offset);
+                       }
+                       goto done;
+               }
         } while (key.offset-- > 0);
  
         if (failed && !retried) {
@@ -4602,11 +4635,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
         int devs_min;           /* min devs needed */
         int devs_increment;     /* ndevs has to be a multiple of this */
         int ncopies;            /* how many copies to data has */
+       int nparity;            /* number of stripes worth of bytes to
+                                  store parity information */
         int ret;
         u64 max_stripe_size;
         u64 max_chunk_size;
         u64 stripe_size;
-       u64 num_bytes;
+       u64 chunk_size;
         int ndevs;
         int i;
         int j;
@@ -4628,6 +4663,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
         devs_min = btrfs_raid_array[index].devs_min;
         devs_increment = btrfs_raid_array[index].devs_increment;
         ncopies = btrfs_raid_array[index].ncopies;
+       nparity = btrfs_raid_array[index].nparity;
  
         if (type & BTRFS_BLOCK_GROUP_DATA) {
                 max_stripe_size = SZ_1G;
@@ -4757,30 +4793,22 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
          * this will have to be fixed for RAID1 and RAID10 over
          * more drives
          */
-       data_stripes = num_stripes / ncopies;
-
-       if (type & BTRFS_BLOCK_GROUP_RAID5)
-               data_stripes = num_stripes - 1;
-
-       if (type & BTRFS_BLOCK_GROUP_RAID6)
-               data_stripes = num_stripes - 2;
+       data_stripes = (num_stripes - nparity) / ncopies;
  
         /*
          * Use the number of data stripes to figure out how big this chunk
          * is really going to be in terms of logical address space,
-        * and compare that answer with the max chunk size
+        * and compare that answer with the max chunk size. If it's higher,
+        * we try to reduce stripe_size.
          */
         if (stripe_size * data_stripes > max_chunk_size) {
-               stripe_size = div_u64(max_chunk_size, data_stripes);
-
-               /* bump the answer up to a 16MB boundary */
-               stripe_size = round_up(stripe_size, SZ_16M);
-
                 /*
-                * But don't go higher than the limits we found while searching
-                * for free extents
+                * Reduce stripe_size, round it up to a 16MB boundary again and
+                * then use it, unless it ends up being even bigger than the
+                * previous value we had already.
                  */
-               stripe_size = min(devices_info[ndevs - 1].max_avail,
+               stripe_size = min(round_up(div_u64(max_chunk_size,
+                                                  data_stripes), SZ_16M),
                                   stripe_size);
         }
  
@@ -4808,9 +4836,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
         map->type = type;
         map->sub_stripes = sub_stripes;
  
-       num_bytes = stripe_size * data_stripes;
+       chunk_size = stripe_size * data_stripes;
  
-       trace_btrfs_chunk_alloc(info, map, start, num_bytes);
+       trace_btrfs_chunk_alloc(info, map, start, chunk_size);
  
         em = alloc_extent_map();
         if (!em) {
@@ -4821,7 +4849,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
         em->map_lookup = map;
         em->start = start;
-       em->len = num_bytes;
+       em->len = chunk_size;
         em->block_start = 0;
         em->block_len = em->len;
         em->orig_block_len = stripe_size;
@@ -4839,14 +4867,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
         refcount_inc(&em->refs);
         write_unlock(&em_tree->lock);
  
-       ret = btrfs_make_block_group(trans, 0, type, start, num_bytes);
+       ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
         if (ret)
                 goto error_del_extent;
  
-       for (i = 0; i < map->num_stripes; i++) {
-               num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
-               btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
-       }
+       for (i = 0; i < map->num_stripes; i++)
+               btrfs_device_set_bytes_used(map->stripes[i].dev,
+                               map->stripes[i].dev->bytes_used + stripe_size);
  
         atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
  
@@ -4890,7 +4917,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
         int i = 0;
         int ret = 0;
  
-       em = get_chunk_map(fs_info, chunk_offset, chunk_size);
+       em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
         if (IS_ERR(em))
                 return PTR_ERR(em);
  
@@ -5032,7 +5059,7 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
         int miss_ndevs = 0;
         int i;
  
-       em = get_chunk_map(fs_info, chunk_offset, 1);
+       em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
         if (IS_ERR(em))
                 return 1;
  
@@ -5092,7 +5119,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
         struct map_lookup *map;
         int ret;
  
-       em = get_chunk_map(fs_info, logical, len);
+       em = btrfs_get_chunk_map(fs_info, logical, len);
         if (IS_ERR(em))
                 /*
                  * We could return errors for these cases, but that could get
@@ -5138,7 +5165,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
         struct map_lookup *map;
         unsigned long len = fs_info->sectorsize;
  
-       em = get_chunk_map(fs_info, logical, len);
+       em = btrfs_get_chunk_map(fs_info, logical, len);
  
         if (!WARN_ON(IS_ERR(em))) {
                 map = em->map_lookup;
@@ -5155,7 +5182,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
         struct map_lookup *map;
         int ret = 0;
  
-       em = get_chunk_map(fs_info, logical, len);
+       em = btrfs_get_chunk_map(fs_info, logical, len);
  
         if(!WARN_ON(IS_ERR(em))) {
                 map = em->map_lookup;
@@ -5314,7 +5341,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
         /* discard always return a bbio */
         ASSERT(bbio_ret);
  
-       em = get_chunk_map(fs_info, logical, length);
+       em = btrfs_get_chunk_map(fs_info, logical, length);
         if (IS_ERR(em))
                 return PTR_ERR(em);
  
@@ -5640,7 +5667,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                 return __btrfs_map_block_for_discard(fs_info, logical,
                                                      *length, bbio_ret);
  
-       em = get_chunk_map(fs_info, logical, *length);
+       em = btrfs_get_chunk_map(fs_info, logical, *length);
         if (IS_ERR(em))
                 return PTR_ERR(em);
  
@@ -5943,7 +5970,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
         u64 rmap_len;
         int i, j, nr = 0;
  
-       em = get_chunk_map(fs_info, chunk_start, 1);
+       em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
         if (IS_ERR(em))
                 return -EIO;
  
@@ -6083,12 +6110,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
         int should_queue = 1;
         struct btrfs_pending_bios *pending_bios;
  
-       if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
-           !device->bdev) {
-               bio_io_error(bio);
-               return;
-       }
-
         /* don't bother with additional async steps for reads, right now */
         if (bio_op(bio) == REQ_OP_READ) {
                 btrfsic_submit_bio(bio);
@@ -6217,7 +6238,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
  
         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
                 dev = bbio->stripes[dev_nr].dev;
-               if (!dev || !dev->bdev ||
+               if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
+                                                  &dev->dev_state) ||
                     (bio_op(first_bio) == REQ_OP_WRITE &&
                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
                         bbio_error(bbio, first_bio, logical);
@@ -7387,6 +7409,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
         struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
         struct extent_map *em;
         struct map_lookup *map;
+       struct btrfs_device *dev;
         u64 stripe_len;
         bool found = false;
         int ret = 0;
@@ -7436,6 +7459,22 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
                         physical_offset, devid);
                 ret = -EUCLEAN;
         }
+
+       /* Make sure no dev extent is beyond device boundary */
+       dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+       if (!dev) {
+               btrfs_err(fs_info, "failed to find devid %llu", devid);
+               ret = -EUCLEAN;
+               goto out;
+       }
+       if (physical_offset + physical_len > dev->disk_total_bytes) {
+               btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
+                         devid, physical_offset, physical_len,
+                         dev->disk_total_bytes);
+               ret = -EUCLEAN;
+               goto out;
+       }
  out:
         free_extent_map(em);
         return ret;
@@ -7478,6 +7517,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
         struct btrfs_path *path;
         struct btrfs_root *root = fs_info->dev_root;
         struct btrfs_key key;
+       u64 prev_devid = 0;
+       u64 prev_dev_ext_end = 0;
         int ret = 0;
  
         key.objectid = 1;
@@ -7522,10 +7563,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
                 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
                 physical_len = btrfs_dev_extent_length(leaf, dext);
  
+               /* Check if this dev extent overlaps with the previous one */
+               if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+                       btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
+                                 devid, physical_offset, prev_dev_ext_end);
+                       ret = -EUCLEAN;
+                       goto out;
+               }
+
                 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
                                             physical_offset, physical_len);
                 if (ret < 0)
                         goto out;
+               prev_devid = devid;
+               prev_dev_ext_end = physical_offset + physical_len;
+
                 ret = btrfs_next_item(root, path);
                 if (ret < 0)
                         goto out;
@@ -7541,3 +7594,27 @@ out:
         btrfs_free_path(path);
         return ret;
  }
+
+/*
+ * Look up @ptr (a block group or device) in the fs_info->swapfile_pins
+ * rbtree; true means some inode in use as a swapfile has it pinned.
+ */
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
+{
+       struct rb_node *cur;
+       bool pinned = false;
+
+       spin_lock(&fs_info->swapfile_pins_lock);
+       cur = fs_info->swapfile_pins.rb_node;
+       while (cur && !pinned) {
+               struct btrfs_swapfile_pin *pin;
+
+               pin = rb_entry(cur, struct btrfs_swapfile_pin, node);
+               if (ptr == pin->ptr)
+                       pinned = true;
+               else
+                       cur = ptr < pin->ptr ? cur->rb_left : cur->rb_right;
+       }
+       spin_unlock(&fs_info->swapfile_pins_lock);
+       return pinned;
+}