btrfs: scrub: introduce the main read repair worker for scrub_stripe
[sfrench/cifs-2.6.git] / fs / btrfs / scrub.c
index 876bc7e3d736e0b49cf0fe748317f477856e5dd7..3b39b87558d59337428ebe0194c5f9c351491dcb 100644 (file)
@@ -121,6 +121,7 @@ struct scrub_stripe {
 
        atomic_t pending_io;
        wait_queue_head_t io_wait;
+       wait_queue_head_t repair_wait;
 
        /*
         * Indicate the states of the stripe.  Bits are defined in
@@ -156,6 +157,8 @@ struct scrub_stripe {
         * group.
         */
        u8 *csums;
+
+       struct work_struct work;
 };
 
 struct scrub_recover {
@@ -381,6 +384,7 @@ int init_scrub_stripe(struct btrfs_fs_info *fs_info, struct scrub_stripe *stripe
        stripe->state = 0;
 
        init_waitqueue_head(&stripe->io_wait);
+       init_waitqueue_head(&stripe->repair_wait);
        atomic_set(&stripe->pending_io, 0);
 
        ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
@@ -403,7 +407,7 @@ error:
        return -ENOMEM;
 }
 
-void wait_scrub_stripe_io(struct scrub_stripe *stripe)
+static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
 {
        wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
 }
@@ -2327,7 +2331,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
 }
 
 /* Verify specified sectors of a stripe. */
-void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
+static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
 {
        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
        const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
@@ -2340,6 +2344,203 @@ void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
        }
 }
 
+/*
+ * Translate the first bvec of a bio into a sector index inside @stripe.
+ *
+ * The stripe's sectors are scanned in order; the first one whose page and
+ * in-page offset both equal those of @first_bvec is the answer.  Not finding
+ * a match trips the ASSERT() below.
+ */
+static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
+{
+       int sector_nr;
+
+       for (sector_nr = 0; sector_nr < stripe->nr_sectors; sector_nr++) {
+               /* Only look at the offset once the page itself matches. */
+               if (scrub_stripe_get_page(stripe, sector_nr) != first_bvec->bv_page)
+                       continue;
+               if (scrub_stripe_get_page_offset(stripe, sector_nr) == first_bvec->bv_offset)
+                       break;
+       }
+       ASSERT(sector_nr < stripe->nr_sectors);
+       return sector_nr;
+}
+
+/*
+ * Completion handler for a repair read.
+ *
+ * Compared with the regular read, a repair read:
+ *
+ * - Only covers the failed sectors
+ * - May be subject to an extra blocksize limit
+ *
+ * So the bio maps to a sub-range of the stripe: the range starts at the
+ * sector matching the bio's first bvec and its length is the sum of all bvec
+ * lengths.  On error that range is set in both io_error_bitmap and
+ * error_bitmap, on success it is cleared from io_error_bitmap only.
+ */
+static void scrub_repair_read_endio(struct btrfs_bio *bbio)
+{
+       struct scrub_stripe *stripe = bbio->private;
+       struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+       int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+       struct bio_vec *bvec;
+       u32 bio_size = 0;
+       u32 nr_read;
+       int i;
+
+       ASSERT(sector_nr < stripe->nr_sectors);
+
+       /* Total length of the read, summed over its bvecs. */
+       bio_for_each_bvec_all(bvec, &bbio->bio, i)
+               bio_size += bvec->bv_len;
+       nr_read = bio_size >> fs_info->sectorsize_bits;
+
+       if (!bbio->bio.bi_status) {
+               bitmap_clear(&stripe->io_error_bitmap, sector_nr, nr_read);
+       } else {
+               bitmap_set(&stripe->io_error_bitmap, sector_nr, nr_read);
+               bitmap_set(&stripe->error_bitmap, sector_nr, nr_read);
+       }
+       bio_put(&bbio->bio);
+
+       /* The last IO to finish wakes up whoever waits on the stripe. */
+       if (atomic_dec_and_test(&stripe->pending_io))
+               wake_up(&stripe->io_wait);
+}
+
+/*
+ * Return the mirror number that follows @mirror, wrapping around to 1 once
+ * @num_copies would be exceeded.
+ */
+static int calc_next_mirror(int mirror, int num_copies)
+{
+       int next;
+
+       ASSERT(mirror <= num_copies);
+
+       next = mirror + 1;
+       if (next > num_copies)
+               next = 1;
+       return next;
+}
+
+/*
+ * Re-read the sectors flagged in stripe->error_bitmap from @mirror.
+ *
+ * Adjacent flagged sectors are merged into one bio.  A new bio is started
+ * when the previous sector is not flagged or when the current bio already
+ * holds @blocksize bytes or more.  stripe->pending_io is incremented right
+ * before each submission.
+ *
+ * With @wait set every bio is waited for right after being submitted;
+ * otherwise the function may return with IO still in flight.
+ */
+static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
+                                           int mirror, int blocksize, bool wait)
+{
+       struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+       /* Snapshot of the sectors to re-read, taken before any IO is issued. */
+       const unsigned long old_error_bitmap = stripe->error_bitmap;
+       struct btrfs_bio *bbio = NULL;
+       int sector_nr;
+
+       ASSERT(stripe->mirror_num >= 1);
+       ASSERT(atomic_read(&stripe->pending_io) == 0);
+
+       for_each_set_bit(sector_nr, &old_error_bitmap, stripe->nr_sectors) {
+               struct page *page = scrub_stripe_get_page(stripe, sector_nr);
+               int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
+               int ret;
+
+               if (bbio) {
+                       const bool after_gap = sector_nr > 0 &&
+                               !test_bit(sector_nr - 1, &stripe->error_bitmap);
+                       const bool bio_full =
+                               bbio->bio.bi_iter.bi_size >= blocksize;
+
+                       /* The current sector cannot be merged, submit the bio. */
+                       if (after_gap || bio_full) {
+                               ASSERT(bbio->bio.bi_iter.bi_size);
+                               atomic_inc(&stripe->pending_io);
+                               btrfs_submit_bio(bbio, mirror);
+                               if (wait)
+                                       wait_scrub_stripe_io(stripe);
+                               bbio = NULL;
+                       }
+               }
+
+               /* Start a new bio at the logical address of this sector. */
+               if (!bbio) {
+                       bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+                               fs_info, scrub_repair_read_endio, stripe);
+                       bbio->bio.bi_iter.bi_sector = (stripe->logical +
+                               (sector_nr << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
+               }
+
+               ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+               ASSERT(ret == fs_info->sectorsize);
+       }
+
+       /* Flush the bio that is still being built, if there is one. */
+       if (bbio) {
+               ASSERT(bbio->bio.bi_iter.bi_size);
+               atomic_inc(&stripe->pending_io);
+               btrfs_submit_bio(bbio, mirror);
+               if (wait)
+                       wait_scrub_stripe_io(stripe);
+       }
+}
+
+/*
+ * Work function driving all the read related scrub work of one stripe:
+ *
+ * 1) Wait for the initial read to finish
+ * 2) Verify the stripe and locate any bad sectors
+ * 3) Retry the bad sectors on every remaining mirror, reading blocks as
+ *    large as possible
+ * 4) Retry on all mirrors (including the failed mirror), sector-by-sector
+ *
+ * Writeback does not happen here, it needs extra synchronization.
+ *
+ * Whatever the outcome, SCRUB_STRIPE_FLAG_REPAIR_DONE is set in
+ * stripe->state and the waiters on stripe->repair_wait are woken at the end.
+ */
+static void scrub_stripe_read_repair_worker(struct work_struct *work)
+{
+       struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
+       struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+       int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
+                                         stripe->bg->length);
+       int mirror;
+       int tries;
+
+       ASSERT(stripe->mirror_num > 0);
+
+       wait_scrub_stripe_io(stripe);
+       scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
+       /* Keep the initial failures around for later repair and reporting. */
+       stripe->init_error_bitmap = stripe->error_bitmap;
+
+       /* Nothing failed, nothing to repair. */
+       if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
+               goto out;
+
+       /*
+        * Go through all the remaining mirrors.
+        *
+        * Reads are still as large as possible at this stage, as this is
+        * faster and there are extra safety nets to rely on.
+        */
+       mirror = calc_next_mirror(stripe->mirror_num, num_copies);
+       while (mirror != stripe->mirror_num) {
+               const unsigned long to_verify = stripe->error_bitmap;
+
+               scrub_stripe_submit_repair_read(stripe, mirror,
+                                               BTRFS_STRIPE_LEN, false);
+               wait_scrub_stripe_io(stripe);
+               scrub_verify_one_stripe(stripe, to_verify);
+               if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+                       goto out;
+
+               mirror = calc_next_mirror(mirror, num_copies);
+       }
+
+       /*
+        * Last safety net: re-check all mirrors, the failed one included,
+        * sector-by-sector.
+        *
+        * A single sector failing the drive's internal csum makes the whole
+        * read containing that sector fail, hence one read per sector here.
+        *
+        * This can be slow, so it is only tried as the last resort.
+        */
+       mirror = stripe->mirror_num;
+       for (tries = 0; tries < num_copies; tries++) {
+               const unsigned long to_verify = stripe->error_bitmap;
+
+               scrub_stripe_submit_repair_read(stripe, mirror,
+                                               fs_info->sectorsize, true);
+               wait_scrub_stripe_io(stripe);
+               scrub_verify_one_stripe(stripe, to_verify);
+               if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+                       goto out;
+
+               mirror = calc_next_mirror(mirror, num_copies);
+       }
+out:
+       set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
+       wake_up(&stripe->repair_wait);
+}
+
+/*
+ * Completion handler for a read covering the whole stripe.
+ *
+ * A failed bio flags every sector of the stripe in both io_error_bitmap and
+ * error_bitmap, a successful one clears io_error_bitmap.  When the last
+ * pending IO of the stripe completes, the waiters on io_wait are woken and
+ * the read repair worker is queued on fs_info->scrub_workers.
+ */
+void scrub_read_endio(struct btrfs_bio *bbio)
+{
+       struct scrub_stripe *stripe = bbio->private;
+
+       if (!bbio->bio.bi_status) {
+               bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+       } else {
+               bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+               bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
+       }
+       bio_put(&bbio->bio);
+
+       /* Only the completion of the last pending IO goes any further. */
+       if (!atomic_dec_and_test(&stripe->pending_io))
+               return;
+
+       wake_up(&stripe->io_wait);
+       INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
+       queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
+}
+
 static int scrub_checksum_tree_block(struct scrub_block *sblock)
 {
        struct scrub_ctx *sctx = sblock->sctx;