diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a5f8419e2df1d5624f587a3615c3e46348532701..a4556d2e46bf95f3bb8020941a00119c050cf662 100644
@@ -76,9 +76,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
 static void end_reshape_write(struct bio *bio);
 static void end_reshape(struct r10conf *conf);
 
-#define raid10_log(md, fmt, args...)                           \
-       do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
-
 #include "raid1-10.c"
 
 #define NULL_CMD
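
The driver-local raid10_log() macro deleted above is replaced throughout this diff by a shared mddev_add_trace_msg() helper, with callers now passing the "raid10 " prefix explicitly. A minimal sketch of the helper, assuming it lives in the md core and guards on mddev_is_dm() rather than on md->queue (consistent with the raid10_run() hunk further down; the actual definition is not part of this diff):

	#define mddev_add_trace_msg(mddev, fmt, args...)			\
	do {									\
		if (!mddev_is_dm(mddev))					\
			blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \
	} while (0)
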
@@ -518,11 +515,7 @@ static void raid10_end_write_request(struct bio *bio)
                 * The 'master' represents the composite IO operation to
                 * user-side. So if something waits for IO, then it will
                 * wait for the 'master' bio.
-                */
-               sector_t first_bad;
-               int bad_sectors;
-
-               /*
+                *
                 * Do not set R10BIO_Uptodate if the current device is
                 * rebuilding or Faulty. This is because we cannot use
                 * such device for properly reading the data back (we could
@@ -535,10 +528,9 @@ static void raid10_end_write_request(struct bio *bio)
                        set_bit(R10BIO_Uptodate, &r10_bio->state);
 
                /* Maybe we can clear some bad blocks. */
-               if (is_badblock(rdev,
-                               r10_bio->devs[slot].addr,
-                               r10_bio->sectors,
-                               &first_bad, &bad_sectors) && !discard_error) {
+               if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
+                                     r10_bio->sectors) &&
+                   !discard_error) {
                        bio_put(bio);
                        if (repl)
                                r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
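
This and several later hunks (retry_wait, end_sync_write, r10_sync_page_io, fix_read_error) replace open-coded is_badblock() calls, each of which needed throwaway first_bad/bad_sectors locals, with rdev_has_badblock(). A sketch of the helper, on the assumption that it simply drops the out-parameters while keeping is_badblock()'s return convention:

	static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
					    int sectors)
	{
		sector_t first_bad;	/* discarded; callers only need yes/no */
		int bad_sectors;

		return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors);
	}

Preserving the return value matters: the retry_wait hunk below still tests for a negative result, which signals an unacknowledged bad block that must not be written over.
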
@@ -753,17 +745,8 @@ static struct md_rdev *read_balance(struct r10conf *conf,
        best_good_sectors = 0;
        do_balance = 1;
        clear_bit(R10BIO_FailFast, &r10_bio->state);
-       /*
-        * Check if we can balance. We can balance on the whole
-        * device if no resync is going on (recovery is ok), or below
-        * the resync window. We take the first readable disk when
-        * above the resync window.
-        */
-       if ((conf->mddev->recovery_cp < MaxSector
-            && (this_sector + sectors >= conf->next_resync)) ||
-           (mddev_is_clustered(conf->mddev) &&
-            md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
-                                           this_sector + sectors)))
+
+       if (raid1_should_read_first(conf->mddev, this_sector, sectors))
                do_balance = 0;
 
        for (slot = 0; slot < conf->copies ; slot++) {
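
The resync-window check deleted above moves into raid1_should_read_first(), shared with raid1 via the raid1-10.c include at the top of this file. A sketch reconstructed from the deleted lines; since the helper only sees mddev, the raid10-private conf->next_resync bound is assumed to become an mddev-level one:

	static inline bool raid1_should_read_first(struct mddev *mddev,
						   sector_t this_sector, int len)
	{
		/* resync in flight and this read overlaps the resync window */
		if ((mddev->recovery_cp < MaxSector) &&
		    (this_sector + len >= mddev->recovery_cp))
			return true;

		/* a cluster peer is resyncing this range */
		if (mddev_is_clustered(mddev) &&
		    md_cluster_ops->area_resyncing(mddev, READ, this_sector,
						   this_sector + len))
			return true;

		return false;
	}

The effect is unchanged: when either condition holds, read balancing is skipped and the first readable disk is used.
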
@@ -1033,7 +1016,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
                        ret = false;
                } else {
                        conf->nr_waiting++;
-                       raid10_log(conf->mddev, "wait barrier");
+                       mddev_add_trace_msg(conf->mddev, "raid10 wait barrier");
                        wait_event_barrier(conf, stop_waiting_barrier(conf));
                        conf->nr_waiting--;
                }
@@ -1152,7 +1135,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
                        bio_wouldblock_error(bio);
                        return false;
                }
-               raid10_log(conf->mddev, "wait reshape");
+               mddev_add_trace_msg(conf->mddev, "raid10 wait reshape");
                wait_event(conf->wait_barrier,
                           conf->reshape_progress <= bio->bi_iter.bi_sector ||
                           conf->reshape_progress >= bio->bi_iter.bi_sector +
@@ -1249,10 +1232,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
            test_bit(R10BIO_FailFast, &r10_bio->state))
                read_bio->bi_opf |= MD_FAILFAST;
        read_bio->bi_private = r10_bio;
-
-       if (mddev->gendisk)
-               trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
-                                     r10_bio->sector);
+       mddev_trace_remap(mddev, read_bio, r10_bio->sector);
        submit_bio_noacct(read_bio);
        return;
 }
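
The open-coded gendisk check plus trace_block_bio_remap() call here, and again in raid10_write_one_disk() below, collapses into mddev_trace_remap(). Presumably along these lines (an assumption — the real definition lives in the md core, not in this diff):

	static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
					     sector_t sector)
	{
		if (!mddev_is_dm(mddev))
			trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
					      sector);
	}
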
@@ -1288,10 +1268,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
                         && enough(conf, devnum))
                mbio->bi_opf |= MD_FAILFAST;
        mbio->bi_private = r10_bio;
-
-       if (conf->mddev->gendisk)
-               trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
-                                     r10_bio->sector);
+       mddev_trace_remap(mddev, mbio, r10_bio->sector);
        /* flush_pending_writes() needs access to the rdev so...*/
        mbio->bi_bdev = (void *)rdev;
 
@@ -1330,10 +1307,7 @@ retry_wait:
                }
 
                if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
-                       sector_t first_bad;
                        sector_t dev_sector = r10_bio->devs[i].addr;
-                       int bad_sectors;
-                       int is_bad;
 
                        /*
                         * Discard request doesn't care the write result
@@ -1342,9 +1316,8 @@ retry_wait:
                        if (!r10_bio->sectors)
                                continue;
 
-                       is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
-                                            &first_bad, &bad_sectors);
-                       if (is_bad < 0) {
+                       if (rdev_has_badblock(rdev, dev_sector,
+                                             r10_bio->sectors) < 0) {
                                /*
                                 * Mustn't write here until the bad block
                                 * is acknowledged
@@ -1360,8 +1333,9 @@ retry_wait:
        if (unlikely(blocked_rdev)) {
                /* Have to wait for this device to get unblocked, then retry */
                allow_barrier(conf);
-               raid10_log(conf->mddev, "%s wait rdev %d blocked",
-                               __func__, blocked_rdev->raid_disk);
+               mddev_add_trace_msg(conf->mddev,
+                       "raid10 %s wait rdev %d blocked",
+                       __func__, blocked_rdev->raid_disk);
                md_wait_for_blocked_rdev(blocked_rdev, mddev);
                wait_barrier(conf, false);
                goto retry_wait;
@@ -1416,7 +1390,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
                        bio_wouldblock_error(bio);
                        return;
                }
-               raid10_log(conf->mddev, "wait reshape metadata");
+               mddev_add_trace_msg(conf->mddev,
+                       "raid10 wait reshape metadata");
                wait_event(mddev->sb_wait,
                           !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 
@@ -2131,10 +2106,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
                        continue;
                }
 
-               if (mddev->gendisk)
-                       disk_stack_limits(mddev->gendisk, rdev->bdev,
-                                         rdev->data_offset << 9);
-
+               err = mddev_stack_new_rdev(mddev, rdev);
+               if (err)
+                       return err;
                p->head_position = 0;
                p->recovery_disabled = mddev->recovery_disabled - 1;
                rdev->raid_disk = mirror;
@@ -2150,10 +2124,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
                clear_bit(In_sync, &rdev->flags);
                set_bit(Replacement, &rdev->flags);
                rdev->raid_disk = repl_slot;
-               err = 0;
-               if (mddev->gendisk)
-                       disk_stack_limits(mddev->gendisk, rdev->bdev,
-                                         rdev->data_offset << 9);
+               err = mddev_stack_new_rdev(mddev, rdev);
+               if (err)
+                       return err;
                conf->fullsync = 1;
                WRITE_ONCE(p->replacement, rdev);
        }
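
Both raid10_add_disk() branches now stack the new member's limits via mddev_stack_new_rdev() and, unlike the old disk_stack_limits() calls, propagate failure to the caller. A sketch of the helper, assuming it is built on the same queue_limits API as raid10_set_queue_limits() below (its actual body lives in md.c):

	int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
	{
		struct queue_limits lim;

		if (mddev_is_dm(mddev))
			return 0;	/* dm-raid manages its own limits */
		lim = queue_limits_start_update(mddev->gendisk->queue);
		queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
					mddev->gendisk->disk_name);
		return queue_limits_commit_update(mddev->gendisk->queue, &lim);
	}
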
@@ -2290,8 +2263,6 @@ static void end_sync_write(struct bio *bio)
        struct mddev *mddev = r10_bio->mddev;
        struct r10conf *conf = mddev->private;
        int d;
-       sector_t first_bad;
-       int bad_sectors;
        int slot;
        int repl;
        struct md_rdev *rdev = NULL;
@@ -2312,11 +2283,10 @@ static void end_sync_write(struct bio *bio)
                                        &rdev->mddev->recovery);
                        set_bit(R10BIO_WriteError, &r10_bio->state);
                }
-       } else if (is_badblock(rdev,
-                            r10_bio->devs[slot].addr,
-                            r10_bio->sectors,
-                            &first_bad, &bad_sectors))
+       } else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
+                                    r10_bio->sectors)) {
                set_bit(R10BIO_MadeGood, &r10_bio->state);
+       }
 
        rdev_dec_pending(rdev, mddev);
 
@@ -2597,11 +2567,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
                            int sectors, struct page *page, enum req_op op)
 {
-       sector_t first_bad;
-       int bad_sectors;
-
-       if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
-           && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
+       if (rdev_has_badblock(rdev, sector, sectors) &&
+           (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
                return -1;
        if (sync_page_io(rdev, sector, sectors << 9, page, op, false))
                /* success */
@@ -2658,16 +2625,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
                        s = PAGE_SIZE >> 9;
 
                do {
-                       sector_t first_bad;
-                       int bad_sectors;
-
                        d = r10_bio->devs[sl].devnum;
                        rdev = conf->mirrors[d].rdev;
                        if (rdev &&
                            test_bit(In_sync, &rdev->flags) &&
                            !test_bit(Faulty, &rdev->flags) &&
-                           is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
-                                       &first_bad, &bad_sectors) == 0) {
+                           rdev_has_badblock(rdev,
+                                             r10_bio->devs[sl].addr + sect,
+                                             s) == 0) {
                                atomic_inc(&rdev->nr_pending);
                                success = sync_page_io(rdev,
                                                       r10_bio->devs[sl].addr +
@@ -4002,14 +3967,26 @@ static struct r10conf *setup_conf(struct mddev *mddev)
        return ERR_PTR(err);
 }
 
-static void raid10_set_io_opt(struct r10conf *conf)
+static unsigned int raid10_nr_stripes(struct r10conf *conf)
 {
-       int raid_disks = conf->geo.raid_disks;
+       unsigned int raid_disks = conf->geo.raid_disks;
 
-       if (!(conf->geo.raid_disks % conf->geo.near_copies))
-               raid_disks /= conf->geo.near_copies;
-       blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
-                        raid_disks);
+       if (conf->geo.raid_disks % conf->geo.near_copies)
+               return raid_disks;
+       return raid_disks / conf->geo.near_copies;
+}
+
+static int raid10_set_queue_limits(struct mddev *mddev)
+{
+       struct r10conf *conf = mddev->private;
+       struct queue_limits lim;
+
+       blk_set_stacking_limits(&lim);
+       lim.max_write_zeroes_sectors = 0;
+       lim.io_min = mddev->chunk_sectors << 9;
+       lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
+       mddev_stack_rdev_limits(mddev, &lim);
+       return queue_limits_set(mddev->gendisk->queue, &lim);
 }
 
 static int raid10_run(struct mddev *mddev)
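
The new raid10_set_queue_limits() replaces the piecemeal blk_queue_max_write_zeroes_sectors()/blk_queue_io_min()/blk_queue_io_opt() calls removed from raid10_run() below. The limits are assembled in a local struct queue_limits, starting from blk_set_stacking_limits() defaults; mddev_stack_rdev_limits() (assumed to iterate the member rdevs, much like the disk_stack_limits() loop being deleted) folds in the per-device constraints; and only then is the whole set validated and applied atomically by queue_limits_set(). An error at that point fails array startup instead of leaving half-applied limits.
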
@@ -4021,6 +3998,7 @@ static int raid10_run(struct mddev *mddev)
        sector_t size;
        sector_t min_offset_diff = 0;
        int first = 1;
+       int ret = -EIO;
 
        if (mddev->private == NULL) {
                conf = setup_conf(mddev);
@@ -4047,12 +4025,6 @@ static int raid10_run(struct mddev *mddev)
                }
        }
 
-       if (mddev->queue) {
-               blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
-               blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
-               raid10_set_io_opt(conf);
-       }
-
        rdev_for_each(rdev, mddev) {
                long long diff;
 
@@ -4081,14 +4053,16 @@ static int raid10_run(struct mddev *mddev)
                if (first || diff < min_offset_diff)
                        min_offset_diff = diff;
 
-               if (mddev->gendisk)
-                       disk_stack_limits(mddev->gendisk, rdev->bdev,
-                                         rdev->data_offset << 9);
-
                disk->head_position = 0;
                first = 0;
        }
 
+       if (!mddev_is_dm(conf->mddev)) {
+               ret = raid10_set_queue_limits(mddev);
+               if (ret)
+                       goto out_free_conf;
+       }
+
        /* need to check that every block has at least one working mirror */
        if (!enough(conf, -1)) {
                pr_err("md/raid10:%s: not enough operational mirrors.\n",
@@ -4185,7 +4159,7 @@ out_free_conf:
        raid10_free_conf(conf);
        mddev->private = NULL;
 out:
-       return -EIO;
+       return ret;
 }
 
 static void raid10_free(struct mddev *mddev, void *priv)
@@ -4954,8 +4928,7 @@ static void end_reshape(struct r10conf *conf)
        conf->reshape_safe = MaxSector;
        spin_unlock_irq(&conf->device_lock);
 
-       if (conf->mddev->queue)
-               raid10_set_io_opt(conf);
+       mddev_update_io_opt(conf->mddev, raid10_nr_stripes(conf));
        conf->fullsync = 0;
 }