drivers/md/raid10.c

   1 /*
   2  * raid10.c : Multiple Devices driver for Linux
   3  *
   4  * Copyright (C) 2000-2004 Neil Brown
   5  *
   6  * RAID-10 support for md.
   7  *
   8  * Based on code in raid1.c.  See raid1.c for further copyright information.
   9  *
  10  *
  11  * This program is free software; you can redistribute it and/or modify
  12  * it under the terms of the GNU General Public License as published by
  13  * the Free Software Foundation; either version 2, or (at your option)
  14  * any later version.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * (for example /usr/src/linux/COPYING); if not, write to the Free
  18  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19  */
  20
  21 #include <linux/slab.h>
  22 #include <linux/delay.h>
  23 #include <linux/blkdev.h>
  24 #include <linux/module.h>
  25 #include <linux/seq_file.h>
  26 #include <linux/ratelimit.h>
  27 #include <linux/kthread.h>
  28 #include <linux/raid/md_p.h>
  29 #include <trace/events/block.h>
  30 #include "md.h"
  31 #include "raid10.h"
  32 #include "raid0.h"
  33 #include "md-bitmap.h"
  34
  35 /*
  36  * RAID10 provides a combination of RAID0 and RAID1 functionality.
  37  * The layout of data is defined by
  38  *    chunk_size
  39  *    raid_disks
  40  *    near_copies (stored in low byte of layout)
  41  *    far_copies (stored in second byte of layout)
  42  *    far_offset (stored in bit 16 of layout )
  43  *    use_far_sets (stored in bit 17 of layout )
  44  *    use_far_sets_bugfixed (stored in bit 18 of layout )
  45  *
  46  * The data to be stored is divided into chunks using chunksize.  Each device
  47  * is divided into far_copies sections.   In each section, chunks are laid out
  48  * in a style similar to raid0, but near_copies copies of each chunk are stored
  49  * (each on a different drive).  The starting device for each section is offset
  50  * near_copies from the starting device of the previous section.  Thus there
  51  * are (near_copies * far_copies) of each chunk, and each is on a different
  52  * drive.  near_copies and far_copies must be at least one, and their product
  53  * is at most raid_disks.
  54  *
  55  * If far_offset is true, then the far_copies are handled a bit differently.
  56  * The copies are still in different stripes, but instead of being very far
  57  * apart on disk, they are in adjacent stripes.
  58  *
  59  * The far and offset algorithms are handled slightly differently if
  60  * 'use_far_sets' is true.  In this case, the array's devices are grouped into
  61  * sets that are (near_copies * far_copies) in size.  The far copied stripes
  62  * are still shifted by 'near_copies' devices, but this shifting stays confined
  63  * to the set rather than the entire array.  This is done to improve the number
  64  * of device combinations that can fail without causing the array to fail.
  65  * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
  66  * on a device):
  67  *    A B C D    A B C D E
  68  *      ...         ...
  69  *    D A B C    E A B C D
  70  * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
  71  *    [A B] [C D]    [A B] [C D E]
  72  *    |...| |...|    |...| | ... |
  73  *    [B A] [D C]    [B A] [E C D]
  74  */
  75
  76 /*
  77  * Number of guaranteed r10bios in case of extreme VM load:
  78  */
  79 #define NR_RAID10_BIOS 256
  80
  81 /* when we get a read error on a read-only array, we redirect to another
  82  * device without failing the first device, or trying to over-write to
  83  * correct the read error.  To keep track of bad blocks on a per-bio
  84  * level, we store IO_BLOCKED in the appropriate 'bios' pointer
  85  */
  86 #define IO_BLOCKED ((struct bio *)1)
  87 /* When we successfully write to a known bad-block, we need to remove the
  88  * bad-block marking which must be done from process context.  So we record
  89  * the success by setting devs[n].bio to IO_MADE_GOOD
  90  */
  91 #define IO_MADE_GOOD ((struct bio *)2)
  92
  93 #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
  94
  95 /* When there are this many requests queued to be written by
  96  * the raid10 thread, we become 'congested' to provide back-pressure
  97  * for writeback.
  98  */
  99 static int max_queued_requests = 1024;
 100
 101 static void allow_barrier(struct r10conf *conf);
 102 static void lower_barrier(struct r10conf *conf);
 103 static int _enough(struct r10conf *conf, int previous, int ignore);
 104 static int enough(struct r10conf *conf, int ignore);
 105 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 106                                 int *skipped);
 107 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
 108 static void end_reshape_write(struct bio *bio);
 109 static void end_reshape(struct r10conf *conf);
 110
 111 #define raid10_log(md, fmt, args...)                            \
 112         do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
 113
 114 #include "raid1-10.c"
 115
 116 /*
 117  * for resync bio, r10bio pointer can be retrieved from the per-bio
 118  * 'struct resync_pages'.
 119  */
 120 static inline struct r10bio *get_resync_r10bio(struct bio *bio)
 121 {
 122         return get_resync_pages(bio)->raid_bio;
 123 }
 124
 125 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 126 {
 127         struct r10conf *conf = data;
 128         int size = offsetof(struct r10bio, devs[conf->copies]);
 129
 130         /* allocate a r10bio with room for raid_disks entries in the
 131          * bios array */
 132         return kzalloc(size, gfp_flags);
 133 }
 134
 135 static void r10bio_pool_free(void *r10_bio, void *data)
 136 {
 137         kfree(r10_bio);
 138 }
 139
 140 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 141 /* amount of memory to reserve for resync requests */
 142 #define RESYNC_WINDOW (1024*1024)
 143 /* maximum number of concurrent requests, memory permitting */
 144 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
 145 #define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
 146 #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
 147
 148 /*
 149  * When performing a resync, we need to read and compare, so
 150  * we need as many pages are there are copies.
 151  * When performing a recovery, we need 2 bios, one for read,
 152  * one for write (we recover only one drive per r10buf)
 153  *
 154  */
 155 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 156 {
 157         struct r10conf *conf = data;
 158         struct r10bio *r10_bio;
 159         struct bio *bio;
 160         int j;
 161         int nalloc, nalloc_rp;
 162         struct resync_pages *rps;
 163
 164         r10_bio = r10bio_pool_alloc(gfp_flags, conf);
 165         if (!r10_bio)
 166                 return NULL;
 167
 168         if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
 169             test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
 170                 nalloc = conf->copies; /* resync */
 171         else
 172                 nalloc = 2; /* recovery */
 173
 174         /* allocate once for all bios */
 175         if (!conf->have_replacement)
 176                 nalloc_rp = nalloc;
 177         else
 178                 nalloc_rp = nalloc * 2;
 179         rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
 180         if (!rps)
 181                 goto out_free_r10bio;
 182
 183         /*
 184          * Allocate bios.
 185          */
 186         for (j = nalloc ; j-- ; ) {
 187                 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
 188                 if (!bio)
 189                         goto out_free_bio;
 190                 r10_bio->devs[j].bio = bio;
 191                 if (!conf->have_replacement)
 192                         continue;
 193                 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
 194                 if (!bio)
 195                         goto out_free_bio;
 196                 r10_bio->devs[j].repl_bio = bio;
 197         }
 198         /*
 199          * Allocate RESYNC_PAGES data pages and attach them
 200          * where needed.
 201          */
 202         for (j = 0; j < nalloc; j++) {
 203                 struct bio *rbio = r10_bio->devs[j].repl_bio;
 204                 struct resync_pages *rp, *rp_repl;
 205
 206                 rp = &rps[j];
 207                 if (rbio)
 208                         rp_repl = &rps[nalloc + j];
 209
 210                 bio = r10_bio->devs[j].bio;
 211
 212                 if (!j || test_bit(MD_RECOVERY_SYNC,
 213                                    &conf->mddev->recovery)) {
 214                         if (resync_alloc_pages(rp, gfp_flags))
 215                                 goto out_free_pages;
 216                 } else {
 217                         memcpy(rp, &rps[0], sizeof(*rp));
 218                         resync_get_all_pages(rp);
 219                 }
 220
 221                 rp->raid_bio = r10_bio;
 222                 bio->bi_private = rp;
 223                 if (rbio) {
 224                         memcpy(rp_repl, rp, sizeof(*rp));
 225                         rbio->bi_private = rp_repl;
 226                 }
 227         }
 228
 229         return r10_bio;
 230
 231 out_free_pages:
 232         while (--j >= 0)
 233                 resync_free_pages(&rps[j * 2]);
 234
 235         j = 0;
 236 out_free_bio:
 237         for ( ; j < nalloc; j++) {
 238                 if (r10_bio->devs[j].bio)
 239                         bio_put(r10_bio->devs[j].bio);
 240                 if (r10_bio->devs[j].repl_bio)
 241                         bio_put(r10_bio->devs[j].repl_bio);
 242         }
 243         kfree(rps);
 244 out_free_r10bio:
 245         r10bio_pool_free(r10_bio, conf);
 246         return NULL;
 247 }
 248
 249 static void r10buf_pool_free(void *__r10_bio, void *data)
 250 {
 251         struct r10conf *conf = data;
 252         struct r10bio *r10bio = __r10_bio;
 253         int j;
 254         struct resync_pages *rp = NULL;
 255
 256         for (j = conf->copies; j--; ) {
 257                 struct bio *bio = r10bio->devs[j].bio;
 258
 259                 if (bio) {
 260                         rp = get_resync_pages(bio);
 261                         resync_free_pages(rp);
 262                         bio_put(bio);
 263                 }
 264
 265                 bio = r10bio->devs[j].repl_bio;
 266                 if (bio)
 267                         bio_put(bio);
 268         }
 269
 270         /* resync pages array stored in the 1st bio's .bi_private */
 271         kfree(rp);
 272
 273         r10bio_pool_free(r10bio, conf);
 274 }
 275
 276 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
 277 {
 278         int i;
 279
 280         for (i = 0; i < conf->copies; i++) {
 281                 struct bio **bio = & r10_bio->devs[i].bio;
 282                 if (!BIO_SPECIAL(*bio))
 283                         bio_put(*bio);
 284                 *bio = NULL;
 285                 bio = &r10_bio->devs[i].repl_bio;
 286                 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
 287                         bio_put(*bio);
 288                 *bio = NULL;
 289         }
 290 }
 291
 292 static void free_r10bio(struct r10bio *r10_bio)
 293 {
 294         struct r10conf *conf = r10_bio->mddev->private;
 295
 296         put_all_bios(conf, r10_bio);
 297         mempool_free(r10_bio, &conf->r10bio_pool);
 298 }
 299
 300 static void put_buf(struct r10bio *r10_bio)
 301 {
 302         struct r10conf *conf = r10_bio->mddev->private;
 303
 304         mempool_free(r10_bio, &conf->r10buf_pool);
 305
 306         lower_barrier(conf);
 307 }
 308
 309 static void reschedule_retry(struct r10bio *r10_bio)
 310 {
 311         unsigned long flags;
 312         struct mddev *mddev = r10_bio->mddev;
 313         struct r10conf *conf = mddev->private;
 314
 315         spin_lock_irqsave(&conf->device_lock, flags);
 316         list_add(&r10_bio->retry_list, &conf->retry_list);
 317         conf->nr_queued ++;
 318         spin_unlock_irqrestore(&conf->device_lock, flags);
 319
 320         /* wake up frozen array... */
 321         wake_up(&conf->wait_barrier);
 322
 323         md_wakeup_thread(mddev->thread);
 324 }
 325
 326 /*
 327  * raid_end_bio_io() is called when we have finished servicing a mirrored
 328  * operation and are ready to return a success/failure code to the buffer
 329  * cache layer.
 330  */
 331 static void raid_end_bio_io(struct r10bio *r10_bio)
 332 {
 333         struct bio *bio = r10_bio->master_bio;
 334         struct r10conf *conf = r10_bio->mddev->private;
 335
 336         if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 337                 bio->bi_status = BLK_STS_IOERR;
 338
 339         bio_endio(bio);
 340         /*
 341          * Wake up any possible resync thread that waits for the device
 342          * to go idle.
 343          */
 344         allow_barrier(conf);
 345
 346         free_r10bio(r10_bio);
 347 }
 348
 349 /*
 350  * Update disk head position estimator based on IRQ completion info.
 351  */
 352 static inline void update_head_pos(int slot, struct r10bio *r10_bio)
 353 {
 354         struct r10conf *conf = r10_bio->mddev->private;
 355
 356         conf->mirrors[r10_bio->devs[slot].devnum].head_position =
 357                 r10_bio->devs[slot].addr + (r10_bio->sectors);
 358 }
 359
 360 /*
 361  * Find the disk number which triggered given bio
 362  */
 363 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 364                          struct bio *bio, int *slotp, int *replp)
 365 {
 366         int slot;
 367         int repl = 0;
 368
 369         for (slot = 0; slot < conf->copies; slot++) {
 370                 if (r10_bio->devs[slot].bio == bio)
 371                         break;
 372                 if (r10_bio->devs[slot].repl_bio == bio) {
 373                         repl = 1;
 374                         break;
 375                 }
 376         }
 377
 378         BUG_ON(slot == conf->copies);
 379         update_head_pos(slot, r10_bio);
 380
 381         if (slotp)
 382                 *slotp = slot;
 383         if (replp)
 384                 *replp = repl;
 385         return r10_bio->devs[slot].devnum;
 386 }
 387
 388 static void raid10_end_read_request(struct bio *bio)
 389 {
 390         int uptodate = !bio->bi_status;
 391         struct r10bio *r10_bio = bio->bi_private;
 392         int slot;
 393         struct md_rdev *rdev;
 394         struct r10conf *conf = r10_bio->mddev->private;
 395
 396         slot = r10_bio->read_slot;
 397         rdev = r10_bio->devs[slot].rdev;
 398         /*
 399          * this branch is our 'one mirror IO has finished' event handler:
 400          */
 401         update_head_pos(slot, r10_bio);
 402
 403         if (uptodate) {
 404                 /*
 405                  * Set R10BIO_Uptodate in our master bio, so that
 406                  * we will return a good error code to the higher
 407                  * levels even if IO on some other mirrored buffer fails.
 408                  *
 409                  * The 'master' represents the composite IO operation to
 410                  * user-side. So if something waits for IO, then it will
 411                  * wait for the 'master' bio.
 412                  */
 413                 set_bit(R10BIO_Uptodate, &r10_bio->state);
 414         } else {
 415                 /* If all other devices that store this block have
 416                  * failed, we want to return the error upwards rather
 417                  * than fail the last device.  Here we redefine
 418                  * "uptodate" to mean "Don't want to retry"
 419                  */
 420                 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
 421                              rdev->raid_disk))
 422                         uptodate = 1;
 423         }
 424         if (uptodate) {
 425                 raid_end_bio_io(r10_bio);
 426                 rdev_dec_pending(rdev, conf->mddev);
 427         } else {
 428                 /*
 429                  * oops, read error - keep the refcount on the rdev
 430                  */
 431                 char b[BDEVNAME_SIZE];
 432                 pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
 433                                    mdname(conf->mddev),
 434                                    bdevname(rdev->bdev, b),
 435                                    (unsigned long long)r10_bio->sector);
 436                 set_bit(R10BIO_ReadError, &r10_bio->state);
 437                 reschedule_retry(r10_bio);
 438         }
 439 }
 440
 441 static void close_write(struct r10bio *r10_bio)
 442 {
 443         /* clear the bitmap if all writes complete successfully */
 444         md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
 445                            r10_bio->sectors,
 446                            !test_bit(R10BIO_Degraded, &r10_bio->state),
 447                            0);
 448         md_write_end(r10_bio->mddev);
 449 }
 450
 451 static void one_write_done(struct r10bio *r10_bio)
 452 {
 453         if (atomic_dec_and_test(&r10_bio->remaining)) {
 454                 if (test_bit(R10BIO_WriteError, &r10_bio->state))
 455                         reschedule_retry(r10_bio);
 456                 else {
 457                         close_write(r10_bio);
 458                         if (test_bit(R10BIO_MadeGood, &r10_bio->state))
 459                                 reschedule_retry(r10_bio);
 460                         else
 461                                 raid_end_bio_io(r10_bio);
 462                 }
 463         }
 464 }
 465
 466 static void raid10_end_write_request(struct bio *bio)
 467 {
 468         struct r10bio *r10_bio = bio->bi_private;
 469         int dev;
 470         int dec_rdev = 1;
 471         struct r10conf *conf = r10_bio->mddev->private;
 472         int slot, repl;
 473         struct md_rdev *rdev = NULL;
 474         struct bio *to_put = NULL;
 475         bool discard_error;
 476
 477         discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
 478
 479         dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 480
 481         if (repl)
 482                 rdev = conf->mirrors[dev].replacement;
 483         if (!rdev) {
 484                 smp_rmb();
 485                 repl = 0;
 486                 rdev = conf->mirrors[dev].rdev;
 487         }
 488         /*
 489          * this branch is our 'one mirror IO has finished' event handler:
 490          */
 491         if (bio->bi_status && !discard_error) {
 492                 if (repl)
 493                         /* Never record new bad blocks to replacement,
 494                          * just fail it.
 495                          */
 496                         md_error(rdev->mddev, rdev);
 497                 else {
 498                         set_bit(WriteErrorSeen, &rdev->flags);
 499                         if (!test_and_set_bit(WantReplacement, &rdev->flags))
 500                                 set_bit(MD_RECOVERY_NEEDED,
 501                                         &rdev->mddev->recovery);
 502
 503                         dec_rdev = 0;
 504                         if (test_bit(FailFast, &rdev->flags) &&
 505                             (bio->bi_opf & MD_FAILFAST)) {
 506                                 md_error(rdev->mddev, rdev);
 507                                 if (!test_bit(Faulty, &rdev->flags))
 508                                         /* This is the only remaining device,
 509                                          * We need to retry the write without
 510                                          * FailFast
 511                                          */
 512                                         set_bit(R10BIO_WriteError, &r10_bio->state);
 513                                 else {
 514                                         r10_bio->devs[slot].bio = NULL;
 515                                         to_put = bio;
 516                                         dec_rdev = 1;
 517                                 }
 518                         } else
 519                                 set_bit(R10BIO_WriteError, &r10_bio->state);
 520                 }
 521         } else {
 522                 /*
 523                  * Set R10BIO_Uptodate in our master bio, so that
 524                  * we will return a good error code for to the higher
 525                  * levels even if IO on some other mirrored buffer fails.
 526                  *
 527                  * The 'master' represents the composite IO operation to
 528                  * user-side. So if something waits for IO, then it will
 529                  * wait for the 'master' bio.
 530                  */
 531                 sector_t first_bad;
 532                 int bad_sectors;
 533
 534                 /*
 535                  * Do not set R10BIO_Uptodate if the current device is
 536                  * rebuilding or Faulty. This is because we cannot use
 537                  * such device for properly reading the data back (we could
 538                  * potentially use it, if the current write would have felt
 539                  * before rdev->recovery_offset, but for simplicity we don't
 540                  * check this here.
 541                  */
 542                 if (test_bit(In_sync, &rdev->flags) &&
 543                     !test_bit(Faulty, &rdev->flags))
 544                         set_bit(R10BIO_Uptodate, &r10_bio->state);
 545
 546                 /* Maybe we can clear some bad blocks. */
 547                 if (is_badblock(rdev,
 548                                 r10_bio->devs[slot].addr,
 549                                 r10_bio->sectors,
 550                                 &first_bad, &bad_sectors) && !discard_error) {
 551                         bio_put(bio);
 552                         if (repl)
 553                                 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
 554                         else
 555                                 r10_bio->devs[slot].bio = IO_MADE_GOOD;
 556                         dec_rdev = 0;
 557                         set_bit(R10BIO_MadeGood, &r10_bio->state);
 558                 }
 559         }
 560
 561         /*
 562          *
 563          * Let's see if all mirrored write operations have finished
 564          * already.
 565          */
 566         one_write_done(r10_bio);
 567         if (dec_rdev)
 568                 rdev_dec_pending(rdev, conf->mddev);
 569         if (to_put)
 570                 bio_put(to_put);
 571 }
 572
 573 /*
 574  * RAID10 layout manager
 575  * As well as the chunksize and raid_disks count, there are two
 576  * parameters: near_copies and far_copies.
 577  * near_copies * far_copies must be <= raid_disks.
 578  * Normally one of these will be 1.
 579  * If both are 1, we get raid0.
 580  * If near_copies == raid_disks, we get raid1.
 581  *
 582  * Chunks are laid out in raid0 style with near_copies copies of the
 583  * first chunk, followed by near_copies copies of the next chunk and
 584  * so on.
 585  * If far_copies > 1, then after 1/far_copies of the array has been assigned
 586  * as described above, we start again with a device offset of near_copies.
 587  * So we effectively have another copy of the whole array further down all
 588  * the drives, but with blocks on different drives.
 589  * With this layout, a block is never stored twice on the one device.
 590  *
 591  * raid10_find_phys finds the sector offset of a given virtual sector
 592  * on each device that it is on.
 593  *
 594  * raid10_find_virt does the reverse mapping, from a device and a
 595  * sector offset to a virtual address
 596  */
 597
 598 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 599 {
 600         int n,f;
 601         sector_t sector;
 602         sector_t chunk;
 603         sector_t stripe;
 604         int dev;
 605         int slot = 0;
 606         int last_far_set_start, last_far_set_size;
 607
 608         last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
 609         last_far_set_start *= geo->far_set_size;
 610
 611         last_far_set_size = geo->far_set_size;
 612         last_far_set_size += (geo->raid_disks % geo->far_set_size);
 613
 614         /* now calculate first sector/dev */
 615         chunk = r10bio->sector >> geo->chunk_shift;
 616         sector = r10bio->sector & geo->chunk_mask;
 617
 618         chunk *= geo->near_copies;
 619         stripe = chunk;
 620         dev = sector_div(stripe, geo->raid_disks);
 621         if (geo->far_offset)
 622                 stripe *= geo->far_copies;
 623
 624         sector += stripe << geo->chunk_shift;
 625
 626         /* and calculate all the others */
 627         for (n = 0; n < geo->near_copies; n++) {
 628                 int d = dev;
 629                 int set;
 630                 sector_t s = sector;
 631                 r10bio->devs[slot].devnum = d;
 632                 r10bio->devs[slot].addr = s;
 633                 slot++;
 634
 635                 for (f = 1; f < geo->far_copies; f++) {
 636                         set = d / geo->far_set_size;
 637                         d += geo->near_copies;
 638
 639                         if ((geo->raid_disks % geo->far_set_size) &&
 640                             (d > last_far_set_start)) {
 641                                 d -= last_far_set_start;
 642                                 d %= last_far_set_size;
 643                                 d += last_far_set_start;
 644                         } else {
 645                                 d %= geo->far_set_size;
 646                                 d += geo->far_set_size * set;
 647                         }
 648                         s += geo->stride;
 649                         r10bio->devs[slot].devnum = d;
 650                         r10bio->devs[slot].addr = s;
 651                         slot++;
 652                 }
 653                 dev++;
 654                 if (dev >= geo->raid_disks) {
 655                         dev = 0;
 656                         sector += (geo->chunk_mask + 1);
 657                 }
 658         }
 659 }
 660
 661 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
 662 {
 663         struct geom *geo = &conf->geo;
 664
 665         if (conf->reshape_progress != MaxSector &&
 666             ((r10bio->sector >= conf->reshape_progress) !=
 667              conf->mddev->reshape_backwards)) {
 668                 set_bit(R10BIO_Previous, &r10bio->state);
 669                 geo = &conf->prev;
 670         } else
 671                 clear_bit(R10BIO_Previous, &r10bio->state);
 672
 673         __raid10_find_phys(geo, r10bio);
 674 }
 675
 676 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 677 {
 678         sector_t offset, chunk, vchunk;
 679         /* Never use conf->prev as this is only called during resync
 680          * or recovery, so reshape isn't happening
 681          */
 682         struct geom *geo = &conf->geo;
 683         int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
 684         int far_set_size = geo->far_set_size;
 685         int last_far_set_start;
 686
 687         if (geo->raid_disks % geo->far_set_size) {
 688                 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
 689                 last_far_set_start *= geo->far_set_size;
 690
 691                 if (dev >= last_far_set_start) {
 692                         far_set_size = geo->far_set_size;
 693                         far_set_size += (geo->raid_disks % geo->far_set_size);
 694                         far_set_start = last_far_set_start;
 695                 }
 696         }
 697
 698         offset = sector & geo->chunk_mask;
 699         if (geo->far_offset) {
 700                 int fc;
 701                 chunk = sector >> geo->chunk_shift;
 702                 fc = sector_div(chunk, geo->far_copies);
 703                 dev -= fc * geo->near_copies;
 704                 if (dev < far_set_start)
 705                         dev += far_set_size;
 706         } else {
 707                 while (sector >= geo->stride) {
 708                         sector -= geo->stride;
 709                         if (dev < (geo->near_copies + far_set_start))
 710                                 dev += far_set_size - geo->near_copies;
 711                         else
 712                                 dev -= geo->near_copies;
 713                 }
 714                 chunk = sector >> geo->chunk_shift;
 715         }
 716         vchunk = chunk * geo->raid_disks + dev;
 717         sector_div(vchunk, geo->near_copies);
 718         return (vchunk << geo->chunk_shift) + offset;
 719 }
 720
 721 /*
 722  * This routine returns the disk from which the requested read should
 723  * be done. There is a per-array 'next expected sequential IO' sector
 724  * number - if this matches on the next IO then we use the last disk.
 725  * There is also a per-disk 'last known head position' sector that is
 726  * maintained from IRQ contexts, both the normal and the resync IO
 727  * completion handlers update this position correctly. If there is no
 728  * perfect sequential match then we pick the disk whose head is closest.
 729  *
 730  * If there are 2 mirrors in the same 2 devices, performance degrades
 731  * because position is mirror, not device based.
 732  *
 733  * The rdev for the device selected will have nr_pending incremented.
 734  */
 735
 736 /*
 737  * FIXME: possibly should rethink readbalancing and do it differently
 738  * depending on near_copies / far_copies geometry.
 739  */
 740 static struct md_rdev *read_balance(struct r10conf *conf,
 741                                     struct r10bio *r10_bio,
 742                                     int *max_sectors)
 743 {
 744         const sector_t this_sector = r10_bio->sector;
 745         int disk, slot;
 746         int sectors = r10_bio->sectors;
 747         int best_good_sectors;
 748         sector_t new_distance, best_dist;
 749         struct md_rdev *best_rdev, *rdev = NULL;
 750         int do_balance;
 751         int best_slot;
 752         struct geom *geo = &conf->geo;
 753
 754         raid10_find_phys(conf, r10_bio);
 755         rcu_read_lock();
 756         best_slot = -1;
 757         best_rdev = NULL;
 758         best_dist = MaxSector;
 759         best_good_sectors = 0;
 760         do_balance = 1;
 761         clear_bit(R10BIO_FailFast, &r10_bio->state);
 762         /*
 763          * Check if we can balance. We can balance on the whole
 764          * device if no resync is going on (recovery is ok), or below
 765          * the resync window. We take the first readable disk when
 766          * above the resync window.
 767          */
 768         if ((conf->mddev->recovery_cp < MaxSector
 769              && (this_sector + sectors >= conf->next_resync)) ||
 770             (mddev_is_clustered(conf->mddev) &&
 771              md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
 772                                             this_sector + sectors)))
 773                 do_balance = 0;
 774
 775         for (slot = 0; slot < conf->copies ; slot++) {
 776                 sector_t first_bad;
 777                 int bad_sectors;
 778                 sector_t dev_sector;
 779
 780                 if (r10_bio->devs[slot].bio == IO_BLOCKED)
 781                         continue;
 782                 disk = r10_bio->devs[slot].devnum;
 783                 rdev = rcu_dereference(conf->mirrors[disk].replacement);
 784                 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
 785                     r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
 786                         rdev = rcu_dereference(conf->mirrors[disk].rdev);
 787                 if (rdev == NULL ||
 788                     test_bit(Faulty, &rdev->flags))
 789                         continue;
 790                 if (!test_bit(In_sync, &rdev->flags) &&
 791                     r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
 792                         continue;
 793
 794                 dev_sector = r10_bio->devs[slot].addr;
 795                 if (is_badblock(rdev, dev_sector, sectors,
 796                                 &first_bad, &bad_sectors)) {
 797                         if (best_dist < MaxSector)
 798                                 /* Already have a better slot */
 799                                 continue;
 800                         if (first_bad <= dev_sector) {
 801                                 /* Cannot read here.  If this is the
 802                                  * 'primary' device, then we must not read
 803                                  * beyond 'bad_sectors' from another device.
 804                                  */
 805                                 bad_sectors -= (dev_sector - first_bad);
 806                                 if (!do_balance && sectors > bad_sectors)
 807                                         sectors = bad_sectors;
 808                                 if (best_good_sectors > sectors)
 809                                         best_good_sectors = sectors;
 810                         } else {
 811                                 sector_t good_sectors =
 812                                         first_bad - dev_sector;
 813                                 if (good_sectors > best_good_sectors) {
 814                                         best_good_sectors = good_sectors;
 815                                         best_slot = slot;
 816                                         best_rdev = rdev;
 817                                 }
 818                                 if (!do_balance)
 819                                         /* Must read from here */
 820                                         break;
 821                         }
 822                         continue;
 823                 } else
 824                         best_good_sectors = sectors;
 825
 826                 if (!do_balance)
 827                         break;
 828
 829                 if (best_slot >= 0)
 830                         /* At least 2 disks to choose from so failfast is OK */
 831                         set_bit(R10BIO_FailFast, &r10_bio->state);
 832                 /* This optimisation is debatable, and completely destroys
 833                  * sequential read speed for 'far copies' arrays.  So only
 834                  * keep it for 'near' arrays, and review those later.
 835                  */
 836                 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
 837                         new_distance = 0;
 838
 839                 /* for far > 1 always use the lowest address */
 840                 else if (geo->far_copies > 1)
 841                         new_distance = r10_bio->devs[slot].addr;
 842                 else
 843                         new_distance = abs(r10_bio->devs[slot].addr -
 844                                            conf->mirrors[disk].head_position);
 845                 if (new_distance < best_dist) {
 846                         best_dist = new_distance;
 847                         best_slot = slot;
 848                         best_rdev = rdev;
 849                 }
 850         }
 851         if (slot >= conf->copies) {
 852                 slot = best_slot;
 853                 rdev = best_rdev;
 854         }
 855
 856         if (slot >= 0) {
 857                 atomic_inc(&rdev->nr_pending);
 858                 r10_bio->read_slot = slot;
 859         } else
 860                 rdev = NULL;
 861         rcu_read_unlock();
 862         *max_sectors = best_good_sectors;
 863
 864         return rdev;
 865 }
 866
 867 static int raid10_congested(struct mddev *mddev, int bits)
 868 {
 869         struct r10conf *conf = mddev->private;
 870         int i, ret = 0;
 871
 872         if ((bits & (1 << WB_async_congested)) &&
 873             conf->pending_count >= max_queued_requests)
 874                 return 1;
 875
 876         rcu_read_lock();
 877         for (i = 0;
 878              (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
 879                      && ret == 0;
 880              i++) {
 881                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 882                 if (rdev && !test_bit(Faulty, &rdev->flags)) {
 883                         struct request_queue *q = bdev_get_queue(rdev->bdev);
 884
 885                         ret |= bdi_congested(q->backing_dev_info, bits);
 886                 }
 887         }
 888         rcu_read_unlock();
 889         return ret;
 890 }
 891
 892 static void flush_pending_writes(struct r10conf *conf)
 893 {
 894         /* Any writes that have been queued but are awaiting
 895          * bitmap updates get flushed here.
 896          */
 897         spin_lock_irq(&conf->device_lock);
 898
 899         if (conf->pending_bio_list.head) {
 900                 struct blk_plug plug;
 901                 struct bio *bio;
 902
 903                 bio = bio_list_get(&conf->pending_bio_list);
 904                 conf->pending_count = 0;
 905                 spin_unlock_irq(&conf->device_lock);
 906
 907                 /*
 908                  * As this is called in a wait_event() loop (see freeze_array),
 909                  * current->state might be TASK_UNINTERRUPTIBLE which will
 910                  * cause a warning when we prepare to wait again.  As it is
 911                  * rare that this path is taken, it is perfectly safe to force
 912                  * us to go around the wait_event() loop again, so the warning
 913                  * is a false-positive. Silence the warning by resetting
 914                  * thread state
 915                  */
 916                 __set_current_state(TASK_RUNNING);
 917
 918                 blk_start_plug(&plug);
 919                 /* flush any pending bitmap writes to disk
 920                  * before proceeding w/ I/O */
 921                 md_bitmap_unplug(conf->mddev->bitmap);
 922                 wake_up(&conf->wait_barrier);
 923
 924                 while (bio) { /* submit pending writes */
 925                         struct bio *next = bio->bi_next;
 926                         struct md_rdev *rdev = (void*)bio->bi_disk;
 927                         bio->bi_next = NULL;
 928                         bio_set_dev(bio, rdev->bdev);
 929                         if (test_bit(Faulty, &rdev->flags)) {
 930                                 bio_io_error(bio);
 931                         } else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
 932                                             !blk_queue_discard(bio->bi_disk->queue)))
 933                                 /* Just ignore it */
 934                                 bio_endio(bio);
 935                         else
 936                                 generic_make_request(bio);
 937                         bio = next;
 938                 }
 939                 blk_finish_plug(&plug);
 940         } else
 941                 spin_unlock_irq(&conf->device_lock);
 942 }
 943
/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * Background IO must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
 */
 965
/*
 * raise_barrier() - raise the barrier and wait for pending IO to finish.
 * @conf:  raid10 configuration whose barrier is raised
 * @force: when non-zero, do not wait for nr_waiting to reach zero; the
 *         barrier must already be raised in that case (BUG_ON otherwise).
 *
 * Returns once conf->nr_pending is zero and conf->barrier is below
 * RESYNC_DEPTH.  Both waits are done on conf->wait_barrier under
 * conf->resync_lock.
 */
static void raise_barrier(struct r10conf *conf, int force)
{
        BUG_ON(force && !conf->barrier);
        spin_lock_irq(&conf->resync_lock);

        /* Wait until no block IO is waiting (unless 'force') */
        wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
                            conf->resync_lock);

        /* block any new IO from starting */
        conf->barrier++;

        /* Now wait for all pending IO to complete */
        wait_event_lock_irq(conf->wait_barrier,
                            !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
                            conf->resync_lock);

        spin_unlock_irq(&conf->resync_lock);
}
 985
/*
 * lower_barrier() - drop one barrier reference and wake all waiters.
 * @conf: raid10 configuration whose barrier count is decremented
 *
 * Uses the irqsave lock variant, so the caller's interrupt state is
 * preserved.  The wake-up is issued after the lock has been released.
 */
static void lower_barrier(struct r10conf *conf)
{
        unsigned long flags;
        spin_lock_irqsave(&conf->resync_lock, flags);
        conf->barrier--;
        spin_unlock_irqrestore(&conf->resync_lock, flags);
        wake_up(&conf->wait_barrier);
}
 994
 995 static void wait_barrier(struct r10conf *conf)
 996 {
 997         spin_lock_irq(&conf->resync_lock);
 998         if (conf->barrier) {
 999                 conf->nr_waiting++;
1000                 /* Wait for the barrier to drop.
1001                  * However if there are already pending
1002                  * requests (preventing the barrier from
1003                  * rising completely), and the
1004                  * pre-process bio queue isn't empty,
1005                  * then don't wait, as we need to empty
1006                  * that queue to get the nr_pending
1007                  * count down.
1008                  */
1009                 raid10_log(conf->mddev, "wait barrier");
1010                 wait_event_lock_irq(conf->wait_barrier,
1011                                     !conf->barrier ||
1012                                     (atomic_read(&conf->nr_pending) &&
1013                                      current->bio_list &&
1014                                      (!bio_list_empty(&current->bio_list[0]) ||
1015                                       !bio_list_empty(&current->bio_list[1]))),
1016                                     conf->resync_lock);
1017                 conf->nr_waiting--;
1018                 if (!conf->nr_waiting)
1019                         wake_up(&conf->wait_barrier);
1020         }
1021         atomic_inc(&conf->nr_pending);
1022         spin_unlock_irq(&conf->resync_lock);
1023 }
1024
/*
 * allow_barrier() - drop the nr_pending reference taken by wait_barrier().
 * @conf: raid10 configuration
 *
 * Waiters on conf->wait_barrier are woken when the count reaches zero, or
 * unconditionally while a freeze_array() is in progress
 * (array_freeze_pending is non-zero).
 */
static void allow_barrier(struct r10conf *conf)
{
        if ((atomic_dec_and_test(&conf->nr_pending)) ||
                        (conf->array_freeze_pending))
                wake_up(&conf->wait_barrier);
}
1031
/*
 * freeze_array() - block new IO and wait for in-flight IO to settle.
 * @conf:  raid10 configuration
 * @extra: number of pending requests, beyond those counted in nr_queued,
 *         that are allowed to remain outstanding (the caller's own
 *         request, per the comment below)
 *
 * Raises barrier and nr_waiting and returns with both still raised;
 * unfreeze_array() undoes that.  While waiting, flush_pending_writes() is
 * run as the wait command.
 */
static void freeze_array(struct r10conf *conf, int extra)
{
        /* stop syncio and normal IO and wait for everything to
         * go quiet.
         * We increment barrier and nr_waiting, and then
         * wait until nr_pending match nr_queued+extra
         * This is called in the context of one normal IO request
         * that has failed. Thus any sync request that might be pending
         * will be blocked by nr_pending, and we need to wait for
         * pending IO requests to complete or be queued for re-try.
         * Thus the number queued (nr_queued) plus this request (extra)
         * must match the number of pending IOs (nr_pending) before
         * we continue.
         */
        spin_lock_irq(&conf->resync_lock);
        /* Makes allow_barrier() wake us on every completion. */
        conf->array_freeze_pending++;
        conf->barrier++;
        conf->nr_waiting++;
        wait_event_lock_irq_cmd(conf->wait_barrier,
                                atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
                                conf->resync_lock,
                                flush_pending_writes(conf));

        conf->array_freeze_pending--;
        spin_unlock_irq(&conf->resync_lock);
}
1058
/*
 * unfreeze_array() - undo freeze_array().
 * @conf: raid10 configuration
 *
 * Drops the barrier and nr_waiting counts that freeze_array() raised and
 * wakes everything sleeping on conf->wait_barrier.
 */
static void unfreeze_array(struct r10conf *conf)
{
        /* reverse the effect of the freeze */
        spin_lock_irq(&conf->resync_lock);
        conf->barrier--;
        conf->nr_waiting--;
        wake_up(&conf->wait_barrier);
        spin_unlock_irq(&conf->resync_lock);
}
1068
1069 static sector_t choose_data_offset(struct r10bio *r10_bio,
1070                                    struct md_rdev *rdev)
1071 {
1072         if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1073             test_bit(R10BIO_Previous, &r10_bio->state))
1074                 return rdev->data_offset;
1075         else
1076                 return rdev->new_data_offset;
1077 }
1078
/*
 * Per-plug state for writes queued by raid10_write_one_disk() while the
 * submitting task is plugged; consumed and freed by raid10_unplug().
 */
struct raid10_plug_cb {
        struct blk_plug_cb      cb;             /* embedded callback; container_of() recovers this struct */
        struct bio_list         pending;        /* write bios queued on this plug */
        int                     pending_cnt;    /* number of bios on 'pending' */
};
1084
1085 static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1086 {
1087         struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1088                                                    cb);
1089         struct mddev *mddev = plug->cb.data;
1090         struct r10conf *conf = mddev->private;
1091         struct bio *bio;
1092
1093         if (from_schedule || current->bio_list) {
1094                 spin_lock_irq(&conf->device_lock);
1095                 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1096                 conf->pending_count += plug->pending_cnt;
1097                 spin_unlock_irq(&conf->device_lock);
1098                 wake_up(&conf->wait_barrier);
1099                 md_wakeup_thread(mddev->thread);
1100                 kfree(plug);
1101                 return;
1102         }
1103
1104         /* we aren't scheduling, so we can do the write-out directly. */
1105         bio = bio_list_get(&plug->pending);
1106         md_bitmap_unplug(mddev->bitmap);
1107         wake_up(&conf->wait_barrier);
1108
1109         while (bio) { /* submit pending writes */
1110                 struct bio *next = bio->bi_next;
1111                 struct md_rdev *rdev = (void*)bio->bi_disk;
1112                 bio->bi_next = NULL;
1113                 bio_set_dev(bio, rdev->bdev);
1114                 if (test_bit(Faulty, &rdev->flags)) {
1115                         bio_io_error(bio);
1116                 } else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
1117                                     !blk_queue_discard(bio->bi_disk->queue)))
1118                         /* Just ignore it */
1119                         bio_endio(bio);
1120                 else
1121                         generic_make_request(bio);
1122                 bio = next;
1123         }
1124         kfree(plug);
1125 }
1126
1127 static void raid10_read_request(struct mddev *mddev, struct bio *bio,
1128                                 struct r10bio *r10_bio)
1129 {
1130         struct r10conf *conf = mddev->private;
1131         struct bio *read_bio;
1132         const int op = bio_op(bio);
1133         const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1134         int max_sectors;
1135         sector_t sectors;
1136         struct md_rdev *rdev;
1137         char b[BDEVNAME_SIZE];
1138         int slot = r10_bio->read_slot;
1139         struct md_rdev *err_rdev = NULL;
1140         gfp_t gfp = GFP_NOIO;
1141
1142         if (r10_bio->devs[slot].rdev) {
1143                 /*
1144                  * This is an error retry, but we cannot
1145                  * safely dereference the rdev in the r10_bio,
1146                  * we must use the one in conf.
1147                  * If it has already been disconnected (unlikely)
1148                  * we lose the device name in error messages.
1149                  */
1150                 int disk;
1151                 /*
1152                  * As we are blocking raid10, it is a little safer to
1153                  * use __GFP_HIGH.
1154                  */
1155                 gfp = GFP_NOIO | __GFP_HIGH;
1156
1157                 rcu_read_lock();
1158                 disk = r10_bio->devs[slot].devnum;
1159                 err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
1160                 if (err_rdev)
1161                         bdevname(err_rdev->bdev, b);
1162                 else {
1163                         strcpy(b, "???");
1164                         /* This never gets dereferenced */
1165                         err_rdev = r10_bio->devs[slot].rdev;
1166                 }
1167                 rcu_read_unlock();
1168         }
1169         /*
1170          * Register the new request and wait if the reconstruction
1171          * thread has put up a bar for new requests.
1172          * Continue immediately if no resync is active currently.
1173          */
1174         wait_barrier(conf);
1175
1176         sectors = r10_bio->sectors;
1177         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1178             bio->bi_iter.bi_sector < conf->reshape_progress &&
1179             bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1180                 /*
1181                  * IO spans the reshape position.  Need to wait for reshape to
1182                  * pass
1183                  */
1184                 raid10_log(conf->mddev, "wait reshape");
1185                 allow_barrier(conf);
1186                 wait_event(conf->wait_barrier,
1187                            conf->reshape_progress <= bio->bi_iter.bi_sector ||
1188                            conf->reshape_progress >= bio->bi_iter.bi_sector +
1189                            sectors);
1190                 wait_barrier(conf);
1191         }
1192
1193         rdev = read_balance(conf, r10_bio, &max_sectors);
1194         if (!rdev) {
1195                 if (err_rdev) {
1196                         pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
1197                                             mdname(mddev), b,
1198                                             (unsigned long long)r10_bio->sector);
1199                 }
1200                 raid_end_bio_io(r10_bio);
1201                 return;
1202         }
1203         if (err_rdev)
1204                 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
1205                                    mdname(mddev),
1206                                    bdevname(rdev->bdev, b),
1207                                    (unsigned long long)r10_bio->sector);
1208         if (max_sectors < bio_sectors(bio)) {
1209                 struct bio *split = bio_split(bio, max_sectors,
1210                                               gfp, &conf->bio_split);
1211                 bio_chain(split, bio);
1212                 generic_make_request(bio);
1213                 bio = split;
1214                 r10_bio->master_bio = bio;
1215                 r10_bio->sectors = max_sectors;
1216         }
1217         slot = r10_bio->read_slot;
1218
1219         read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
1220
1221         r10_bio->devs[slot].bio = read_bio;
1222         r10_bio->devs[slot].rdev = rdev;
1223
1224         read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1225                 choose_data_offset(r10_bio, rdev);
1226         bio_set_dev(read_bio, rdev->bdev);
1227         read_bio->bi_end_io = raid10_end_read_request;
1228         bio_set_op_attrs(read_bio, op, do_sync);
1229         if (test_bit(FailFast, &rdev->flags) &&
1230             test_bit(R10BIO_FailFast, &r10_bio->state))
1231                 read_bio->bi_opf |= MD_FAILFAST;
1232         read_bio->bi_private = r10_bio;
1233
1234         if (mddev->gendisk)
1235                 trace_block_bio_remap(read_bio->bi_disk->queue,
1236                                       read_bio, disk_devt(mddev->gendisk),
1237                                       r10_bio->sector);
1238         generic_make_request(read_bio);
1239         return;
1240 }
1241
1242 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
1243                                   struct bio *bio, bool replacement,
1244                                   int n_copy)
1245 {
1246         const int op = bio_op(bio);
1247         const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1248         const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1249         unsigned long flags;
1250         struct blk_plug_cb *cb;
1251         struct raid10_plug_cb *plug = NULL;
1252         struct r10conf *conf = mddev->private;
1253         struct md_rdev *rdev;
1254         int devnum = r10_bio->devs[n_copy].devnum;
1255         struct bio *mbio;
1256
1257         if (replacement) {
1258                 rdev = conf->mirrors[devnum].replacement;
1259                 if (rdev == NULL) {
1260                         /* Replacement just got moved to main 'rdev' */
1261                         smp_mb();
1262                         rdev = conf->mirrors[devnum].rdev;
1263                 }
1264         } else
1265                 rdev = conf->mirrors[devnum].rdev;
1266
1267         mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1268         if (replacement)
1269                 r10_bio->devs[n_copy].repl_bio = mbio;
1270         else
1271                 r10_bio->devs[n_copy].bio = mbio;
1272
1273         mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
1274                                    choose_data_offset(r10_bio, rdev));
1275         bio_set_dev(mbio, rdev->bdev);
1276         mbio->bi_end_io = raid10_end_write_request;
1277         bio_set_op_attrs(mbio, op, do_sync | do_fua);
1278         if (!replacement && test_bit(FailFast,
1279                                      &conf->mirrors[devnum].rdev->flags)
1280                          && enough(conf, devnum))
1281                 mbio->bi_opf |= MD_FAILFAST;
1282         mbio->bi_private = r10_bio;
1283
1284         if (conf->mddev->gendisk)
1285                 trace_block_bio_remap(mbio->bi_disk->queue,
1286                                       mbio, disk_devt(conf->mddev->gendisk),
1287                                       r10_bio->sector);
1288         /* flush_pending_writes() needs access to the rdev so...*/
1289         mbio->bi_disk = (void *)rdev;
1290
1291         atomic_inc(&r10_bio->remaining);
1292
1293         cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
1294         if (cb)
1295                 plug = container_of(cb, struct raid10_plug_cb, cb);
1296         else
1297                 plug = NULL;
1298         if (plug) {
1299                 bio_list_add(&plug->pending, mbio);
1300                 plug->pending_cnt++;
1301         } else {
1302                 spin_lock_irqsave(&conf->device_lock, flags);
1303                 bio_list_add(&conf->pending_bio_list, mbio);
1304                 conf->pending_count++;
1305                 spin_unlock_irqrestore(&conf->device_lock, flags);
1306                 md_wakeup_thread(mddev->thread);
1307         }
1308 }
1309
1310 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1311                                  struct r10bio *r10_bio)
1312 {
1313         struct r10conf *conf = mddev->private;
1314         int i;
1315         struct md_rdev *blocked_rdev;
1316         sector_t sectors;
1317         int max_sectors;
1318
1319         if ((mddev_is_clustered(mddev) &&
1320              md_cluster_ops->area_resyncing(mddev, WRITE,
1321                                             bio->bi_iter.bi_sector,
1322                                             bio_end_sector(bio)))) {
1323                 DEFINE_WAIT(w);
1324                 for (;;) {
1325                         prepare_to_wait(&conf->wait_barrier,
1326                                         &w, TASK_IDLE);
1327                         if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1328                                  bio->bi_iter.bi_sector, bio_end_sector(bio)))
1329                                 break;
1330                         schedule();
1331                 }
1332                 finish_wait(&conf->wait_barrier, &w);
1333         }
1334
1335         /*
1336          * Register the new request and wait if the reconstruction
1337          * thread has put up a bar for new requests.
1338          * Continue immediately if no resync is active currently.
1339          */
1340         wait_barrier(conf);
1341
1342         sectors = r10_bio->sectors;
1343         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1344             bio->bi_iter.bi_sector < conf->reshape_progress &&
1345             bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1346                 /*
1347                  * IO spans the reshape position.  Need to wait for reshape to
1348                  * pass
1349                  */
1350                 raid10_log(conf->mddev, "wait reshape");
1351                 allow_barrier(conf);
1352                 wait_event(conf->wait_barrier,
1353                            conf->reshape_progress <= bio->bi_iter.bi_sector ||
1354                            conf->reshape_progress >= bio->bi_iter.bi_sector +
1355                            sectors);
1356                 wait_barrier(conf);
1357         }
1358
1359         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1360             (mddev->reshape_backwards
1361              ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1362                 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1363              : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1364                 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1365                 /* Need to update reshape_position in metadata */
1366                 mddev->reshape_position = conf->reshape_progress;
1367                 set_mask_bits(&mddev->sb_flags, 0,
1368                               BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1369                 md_wakeup_thread(mddev->thread);
1370                 raid10_log(conf->mddev, "wait reshape metadata");
1371                 wait_event(mddev->sb_wait,
1372                            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1373
1374                 conf->reshape_safe = mddev->reshape_position;
1375         }
1376
1377         if (conf->pending_count >= max_queued_requests) {
1378                 md_wakeup_thread(mddev->thread);
1379                 raid10_log(mddev, "wait queued");
1380                 wait_event(conf->wait_barrier,
1381                            conf->pending_count < max_queued_requests);
1382         }
1383         /* first select target devices under rcu_lock and
1384          * inc refcount on their rdev.  Record them by setting
1385          * bios[x] to bio
1386          * If there are known/acknowledged bad blocks on any device
1387          * on which we have seen a write error, we want to avoid
1388          * writing to those blocks.  This potentially requires several
1389          * writes to write around the bad blocks.  Each set of writes
1390          * gets its own r10_bio with a set of bios attached.
1391          */
1392
1393         r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
1394         raid10_find_phys(conf, r10_bio);
1395 retry_write:
1396         blocked_rdev = NULL;
1397         rcu_read_lock();
1398         max_sectors = r10_bio->sectors;
1399
1400         for (i = 0;  i < conf->copies; i++) {
1401                 int d = r10_bio->devs[i].devnum;
1402                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1403                 struct md_rdev *rrdev = rcu_dereference(
1404                         conf->mirrors[d].replacement);
1405                 if (rdev == rrdev)
1406                         rrdev = NULL;
1407                 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1408                         atomic_inc(&rdev->nr_pending);
1409                         blocked_rdev = rdev;
1410                         break;
1411                 }
1412                 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1413                         atomic_inc(&rrdev->nr_pending);
1414                         blocked_rdev = rrdev;
1415                         break;
1416                 }
1417                 if (rdev && (test_bit(Faulty, &rdev->flags)))
1418                         rdev = NULL;
1419                 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1420                         rrdev = NULL;
1421
1422                 r10_bio->devs[i].bio = NULL;
1423                 r10_bio->devs[i].repl_bio = NULL;
1424
1425                 if (!rdev && !rrdev) {
1426                         set_bit(R10BIO_Degraded, &r10_bio->state);
1427                         continue;
1428                 }
1429                 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1430                         sector_t first_bad;
1431                         sector_t dev_sector = r10_bio->devs[i].addr;
1432                         int bad_sectors;
1433                         int is_bad;
1434
1435                         is_bad = is_badblock(rdev, dev_sector, max_sectors,
1436                                              &first_bad, &bad_sectors);
1437                         if (is_bad < 0) {
1438                                 /* Mustn't write here until the bad block
1439                                  * is acknowledged
1440                                  */
1441                                 atomic_inc(&rdev->nr_pending);
1442                                 set_bit(BlockedBadBlocks, &rdev->flags);
1443                                 blocked_rdev = rdev;
1444                                 break;
1445                         }
1446                         if (is_bad && first_bad <= dev_sector) {
1447                                 /* Cannot write here at all */
1448                                 bad_sectors -= (dev_sector - first_bad);
1449                                 if (bad_sectors < max_sectors)
1450                                         /* Mustn't write more than bad_sectors
1451                                          * to other devices yet
1452                                          */
1453                                         max_sectors = bad_sectors;
1454                                 /* We don't set R10BIO_Degraded as that
1455                                  * only applies if the disk is missing,
1456                                  * so it might be re-added, and we want to
1457                                  * know to recover this chunk.
1458                                  * In this case the device is here, and the
1459                                  * fact that this chunk is not in-sync is
1460                                  * recorded in the bad block log.
1461                                  */
1462                                 continue;
1463                         }
1464                         if (is_bad) {
1465                                 int good_sectors = first_bad - dev_sector;
1466                                 if (good_sectors < max_sectors)
1467                                         max_sectors = good_sectors;
1468                         }
1469                 }
1470                 if (rdev) {
1471                         r10_bio->devs[i].bio = bio;
1472                         atomic_inc(&rdev->nr_pending);
1473                 }
1474                 if (rrdev) {
1475                         r10_bio->devs[i].repl_bio = bio;
1476                         atomic_inc(&rrdev->nr_pending);
1477                 }
1478         }
1479         rcu_read_unlock();
1480
1481         if (unlikely(blocked_rdev)) {
1482                 /* Have to wait for this device to get unblocked, then retry */
1483                 int j;
1484                 int d;
1485
1486                 for (j = 0; j < i; j++) {
1487                         if (r10_bio->devs[j].bio) {
1488                                 d = r10_bio->devs[j].devnum;
1489                                 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1490                         }
1491                         if (r10_bio->devs[j].repl_bio) {
1492                                 struct md_rdev *rdev;
1493                                 d = r10_bio->devs[j].devnum;
1494                                 rdev = conf->mirrors[d].replacement;
1495                                 if (!rdev) {
1496                                         /* Race with remove_disk */
1497                                         smp_mb();
1498                                         rdev = conf->mirrors[d].rdev;
1499                                 }
1500                                 rdev_dec_pending(rdev, mddev);
1501                         }
1502                 }
1503                 allow_barrier(conf);
1504                 raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1505                 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1506                 wait_barrier(conf);
1507                 goto retry_write;
1508         }
1509
1510         if (max_sectors < r10_bio->sectors)
1511                 r10_bio->sectors = max_sectors;
1512
1513         if (r10_bio->sectors < bio_sectors(bio)) {
1514                 struct bio *split = bio_split(bio, r10_bio->sectors,
1515                                               GFP_NOIO, &conf->bio_split);
1516                 bio_chain(split, bio);
1517                 generic_make_request(bio);
1518                 bio = split;
1519                 r10_bio->master_bio = bio;
1520         }
1521
1522         atomic_set(&r10_bio->remaining, 1);
1523         md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1524
1525         for (i = 0; i < conf->copies; i++) {
1526                 if (r10_bio->devs[i].bio)
1527                         raid10_write_one_disk(mddev, r10_bio, bio, false, i);
1528                 if (r10_bio->devs[i].repl_bio)
1529                         raid10_write_one_disk(mddev, r10_bio, bio, true, i);
1530         }
1531         one_write_done(r10_bio);
1532 }
1533
1534 static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
1535 {
             /*
              * Build an r10bio describing 'sectors' sectors of 'bio', starting
              * at the bio's current sector, and pass it to the read or write
              * path according to the bio's data direction.
              */
1536         struct r10conf *conf = mddev->private;
1537         struct r10bio *r10_bio;
1538
             /*
              * NOTE(review): the result is used without a NULL check;
              * presumably mempool_alloc() cannot fail with this GFP mask -
              * confirm against the mempool API.
              */
1539         r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1540
1541         r10_bio->master_bio = bio;
1542         r10_bio->sectors = sectors;
1543
1544         r10_bio->mddev = mddev;
1545         r10_bio->sector = bio->bi_iter.bi_sector;
1546         r10_bio->state = 0;
             /* Zero the first conf->copies entries of devs[]. */
1547         memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
1548
1549         if (bio_data_dir(bio) == READ)
1550                 raid10_read_request(mddev, bio, r10_bio);
1551         else
1552                 raid10_write_request(mddev, bio, r10_bio);
1553 }
1554
1555 static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
1556 {
             /*
              * Handle one incoming bio.  Returns true once the bio has been
              * passed on (to md_flush_request() or __make_request()) and false
              * when md_write_start() returns false.
              */
1557         struct r10conf *conf = mddev->private;
             /* Only bits set in both the current and previous chunk masks survive. */
1558         sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1559         int chunk_sects = chunk_mask + 1;
1560         int sectors = bio_sectors(bio);
1561
1562         if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1563                 md_flush_request(mddev, bio);
1564                 return true;
1565         }
1566
1567         if (!md_write_start(mddev, bio))
1568                 return false;
1569
1570         /*
1571          * If this request crosses a chunk boundary, we need to split
1572          * it.
              * The trimming is only applied when near_copies is smaller than
              * raid_disks in the current or the previous geometry.  'sectors'
              * becomes the distance from the start sector to the end of its
              * chunk (chunk_sects - 1 equals chunk_mask).
1573          */
1574         if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1575                      sectors > chunk_sects
1576                      && (conf->geo.near_copies < conf->geo.raid_disks
1577                          || conf->prev.near_copies <
1578                          conf->prev.raid_disks)))
1579                 sectors = chunk_sects -
1580                         (bio->bi_iter.bi_sector &
1581                          (chunk_sects - 1));
1582         __make_request(mddev, bio, sectors);
1583
1584         /* In case raid10d snuck in to freeze_array */
1585         wake_up(&conf->wait_barrier);
1586         return true;
1587 }
1588
1589 static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1590 {
             /*
              * Write a one-line summary of the array layout and per-device
              * state into 'seq': chunk size, copy counts, "[total/working]"
              * and a map with one character per device.
              */
1591         struct r10conf *conf = mddev->private;
1592         int i;
1593
             /* chunk_sectors / 2 gives K, assuming 512-byte sectors. */
1594         if (conf->geo.near_copies < conf->geo.raid_disks)
1595                 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1596         if (conf->geo.near_copies > 1)
1597                 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1598         if (conf->geo.far_copies > 1) {
                     /* far_offset selects the "offset" wording for the far copies. */
1599                 if (conf->geo.far_offset)
1600                         seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1601                 else
1602                         seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1603                 if (conf->geo.far_set_size != conf->geo.raid_disks)
1604                         seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1605         }
             /* Second number is raid_disks minus the degraded count. */
1606         seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1607                                         conf->geo.raid_disks - mddev->degraded);
             /* rdev pointers are read under RCU; "U" = present and In_sync, "_" otherwise. */
1608         rcu_read_lock();
1609         for (i = 0; i < conf->geo.raid_disks; i++) {
1610                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1611                 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1612         }
1613         rcu_read_unlock();
1614         seq_printf(seq, "]");
1615 }
1616
1617 /* check if there are enough drives for
1618  * every block to appear on at least one.
1619  * Don't consider the device numbered 'ignore'
1620  * as we might be about to remove it.
      *
      * A non-zero 'previous' checks against conf->prev, otherwise against
      * conf->geo.  Returns 1 when every examined group of devices holds at
      * least one In_sync member, 0 as soon as one group holds none.
1621  */
1622 static int _enough(struct r10conf *conf, int previous, int ignore)
1623 {
1624         int first = 0;
1625         int has_enough = 0;
1626         int disks, ncopies;
1627         if (previous) {
1628                 disks = conf->prev.raid_disks;
1629                 ncopies = conf->prev.near_copies;
1630         } else {
1631                 disks = conf->geo.raid_disks;
1632                 ncopies = conf->geo.near_copies;
1633         }
1634
1635         rcu_read_lock();
1636         do {
                     /*
                      * Examine the conf->copies consecutive devices starting at
                      * 'first', wrapping around modulo 'disks', and count those
                      * that are present, In_sync and not 'ignore'.
                      */
1637                 int n = conf->copies;
1638                 int cnt = 0;
1639                 int this = first;
1640                 while (n--) {
1641                         struct md_rdev *rdev;
1642                         if (this != ignore &&
1643                             (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1644                             test_bit(In_sync, &rdev->flags))
1645                                 cnt++;
1646                         this = (this+1) % disks;
1647                 }
1648                 if (cnt == 0)
1649                         goto out;
                     /* Advance the group start by ncopies; stop once it wraps to 0. */
1650                 first = (first + ncopies) % disks;
1651         } while (first != 0);
1652         has_enough = 1;
1653 out:
1654         rcu_read_unlock();
1655         return has_enough;
1656 }
1657
1658 static int enough(struct r10conf *conf, int ignore)
1659 {
1660         /* when calling 'enough', both 'prev' and 'geo' must
1661          * be stable.
1662          * This is ensured if ->reconfig_mutex or ->device_lock
1663          * is held.
              *
              * Returns non-zero only when _enough() succeeds for both the
              * current (previous == 0) and the previous (previous == 1)
              * geometry, leaving device 'ignore' out of consideration.
1664          */
1665         return _enough(conf, 0, ignore) &&
1666                 _enough(conf, 1, ignore);
1667 }
1668
1669 static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1670 {
             /*
              * Record an error against 'rdev'.  All state changes below are
              * made with conf->device_lock held and interrupts disabled.
              */
1671         char b[BDEVNAME_SIZE];
1672         struct r10conf *conf = mddev->private;
1673         unsigned long flags;
1674
1675         /*
1676          * If it is not operational, then we have already marked it as dead
1677          * else if it is the last working disks, ignore the error, let the
1678          * next level up know.
1679          * else mark the drive as failed
1680          */
1681         spin_lock_irqsave(&conf->device_lock, flags);
             /* enough() is asked whether the array still works without this device. */
1682         if (test_bit(In_sync, &rdev->flags)
1683             && !enough(conf, rdev->raid_disk)) {
1684                 /*
1685                  * Don't fail the drive, just return an IO error.
1686                  */
1687                 spin_unlock_irqrestore(&conf->device_lock, flags);
1688                 return;
1689         }
             /* Count the device as degraded only if it was In_sync until now. */
1690         if (test_and_clear_bit(In_sync, &rdev->flags))
1691                 mddev->degraded++;
1692         /*
1693          * If recovery is running, make sure it aborts.
1694          */
1695         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1696         set_bit(Blocked, &rdev->flags);
1697         set_bit(Faulty, &rdev->flags);
             /* Set both superblock-change bits; the clear mask (0) removes none. */
1698         set_mask_bits(&mddev->sb_flags, 0,
1699                       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1700         spin_unlock_irqrestore(&conf->device_lock, flags);
             /* The message is printed after the lock has been dropped. */
1701         pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1702                 "md/raid10:%s: Operation continuing on %d devices.\n",
1703                 mdname(mddev), bdevname(rdev->bdev, b),
1704                 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1705 }
1706
1707 static void print_conf(struct r10conf *conf)
1708 {
             /*
              * Emit a debug-level dump of the configuration: working and total
              * device counts, then one line per populated mirror slot.
              */
1709         int i;
1710         struct md_rdev *rdev;
1711
1712         pr_debug("RAID10 conf printout:\n");
1713         if (!conf) {
1714                 pr_debug("(!conf)\n");
1715                 return;
1716         }
             /* wd = raid_disks minus degraded count, rd = raid_disks. */
1717         pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1718                  conf->geo.raid_disks);
1719
1720         /* This is only called with ->reconfig_mutex held, so
1721          * rcu protection of rdev is not needed */
1722         for (i = 0; i < conf->geo.raid_disks; i++) {
1723                 char b[BDEVNAME_SIZE];
1724                 rdev = conf->mirrors[i].rdev;
                     /* wo = device is not In_sync, o = device is not Faulty. */
1725                 if (rdev)
1726                         pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1727                                  i, !test_bit(In_sync, &rdev->flags),
1728                                  !test_bit(Faulty, &rdev->flags),
1729                                  bdevname(rdev->bdev,b));
1730         }
1731 }
1732
1733 static void close_sync(struct r10conf *conf)
1734 {
             /*
              * Pass through the barrier once (wait, then immediately allow)
              * and then tear down the r10buf_pool mempool.
              * NOTE(review): presumably the wait/allow pair ensures no barrier
              * is still raised before the pool is destroyed - confirm against
              * wait_barrier().
              */
1735         wait_barrier(conf);
1736         allow_barrier(conf);
1737
1738         mempool_exit(&conf->r10buf_pool);
1739 }
1740
1741 static int raid10_spare_active(struct mddev *mddev)
1742 {
             /*
              * Returns the number of slots that gained an In_sync device;
              * mddev->degraded is reduced by the same amount.
              */
1743         int i;
1744         struct r10conf *conf = mddev->private;
1745         struct raid10_info *tmp;
1746         int count = 0;
1747         unsigned long flags;
1748
1749         /*
1750          * Find all non-in_sync disks within the RAID10 configuration
1751          * and mark them in_sync
1752          */
1753         for (i = 0; i < conf->geo.raid_disks; i++) {
1754                 tmp = conf->mirrors + i;
                     /*
                      * A replacement qualifies when it is fully recovered
                      * (recovery_offset == MaxSector), not Faulty, and was not
                      * already In_sync; test_and_set_bit() marks it In_sync.
                      */
1755                 if (tmp->replacement
1756                     && tmp->replacement->recovery_offset == MaxSector
1757                     && !test_bit(Faulty, &tmp->replacement->flags)
1758                     && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1759                         /* Replacement has just become active */
                             /*
                              * Only count the slot if it had no In_sync device
                              * before: either no original device, or one whose
                              * In_sync bit was already clear.  The bit is
                              * cleared on the original device here.
                              */
1760                         if (!tmp->rdev
1761                             || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1762                                 count++;
1763                         if (tmp->rdev) {
1764                                 /* Replaced device not technically faulty,
1765                                  * but we need to be sure it gets removed
1766                                  * and never re-added.
1767                                  */
1768                                 set_bit(Faulty, &tmp->rdev->flags);
1769                                 sysfs_notify_dirent_safe(
1770                                         tmp->rdev->sysfs_state);
1771                         }
1772                         sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
                     /* Otherwise apply the same test to the slot's main device. */
1773                 } else if (tmp->rdev
1774                            && tmp->rdev->recovery_offset == MaxSector
1775                            && !test_bit(Faulty, &tmp->rdev->flags)
1776                            && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1777                         count++;
1778                         sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1779                 }
1780         }
             /* The degraded counter is updated under device_lock. */
1781         spin_lock_irqsave(&conf->device_lock, flags);
1782         mddev->degraded -= count;
1783         spin_unlock_irqrestore(&conf->device_lock, flags);
1784
1785         print_conf(conf);
1786         return count;
1787 }
1788
1789 static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1790 {
             /*
              * Try to place 'rdev' into a mirror slot, either as the slot's
              * device or as a replacement for a device flagged WantReplacement.
              * Returns 0 on success, -EEXIST when no slot accepted it, or
              * -EBUSY / -EINVAL / -ENXIO from the early checks below.
              */
1791         struct r10conf *conf = mddev->private;
1792         int err = -EEXIST;
1793         int mirror;
1794         int first = 0;
1795         int last = conf->geo.raid_disks - 1;
1796
1797         if (mddev->recovery_cp < MaxSector)
1798                 /* only hot-add to in-sync arrays, as recovery is
1799                  * very different from resync
1800                  */
1801                 return -EBUSY;
             /* A device with no saved slot is refused unless the previous geometry has enough devices. */
1802         if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1803                 return -EINVAL;
1804
1805         if (md_integrity_add_rdev(rdev, mddev))
1806                 return -ENXIO;
1807
             /* A device that already names a slot is only tried in that slot. */
1808         if (rdev->raid_disk >= 0)
1809                 first = last = rdev->raid_disk;
1810
             /* Start at the saved slot when it is within range and currently empty. */
1811         if (rdev->saved_raid_disk >= first &&
1812             rdev->saved_raid_disk < conf->geo.raid_disks &&
1813             conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1814                 mirror = rdev->saved_raid_disk;
1815         else
1816                 mirror = first;
1817         for ( ; mirror <= last ; mirror++) {
1818                 struct raid10_info *p = &conf->mirrors[mirror];
                     /* Skip slots whose recovery_disabled matches the array's current value. */
1819                 if (p->recovery_disabled == mddev->recovery_disabled)
1820                         continue;
1821                 if (p->rdev) {
                             /*
                              * Occupied slot: usable only as a replacement, i.e.
                              * the current device wants one and none is set yet.
                              */
1822                         if (!test_bit(WantReplacement, &p->rdev->flags) ||
1823                             p->replacement != NULL)
1824                                 continue;
1825                         clear_bit(In_sync, &rdev->flags);
1826                         set_bit(Replacement, &rdev->flags);
1827                         rdev->raid_disk = mirror;
1828                         err = 0;
1829                         if (mddev->gendisk)
1830                                 disk_stack_limits(mddev->gendisk, rdev->bdev,
1831                                                   rdev->data_offset << 9);
1832                         conf->fullsync = 1;
                             /* Publish the pointer last, after rdev has been set up. */
1833                         rcu_assign_pointer(p->replacement, rdev);
1834                         break;
1835                 }
1836
                     /* Empty slot: install rdev as the slot's main device. */
1837                 if (mddev->gendisk)
1838                         disk_stack_limits(mddev->gendisk, rdev->bdev,
1839                                           rdev->data_offset << 9);
1840
1841                 p->head_position = 0;
                     /* Store a value that differs from the array's current recovery_disabled. */
1842                 p->recovery_disabled = mddev->recovery_disabled - 1;
1843                 rdev->raid_disk = mirror;
1844                 err = 0;
                     /* A full sync is requested unless the device returns to its saved slot. */
1845                 if (rdev->saved_raid_disk != mirror)
1846                         conf->fullsync = 1;
1847                 rcu_assign_pointer(p->rdev, rdev);
1848                 break;
1849         }
             /* This runs whether or not a slot was found above. */
1850         if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1851                 blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
1852
1853         print_conf(conf);
1854         return err;
1855 }
1856
1857 static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1858 {
             /*
              * Detach 'rdev' from the mirror slot given by rdev->raid_disk.
              * Returns 0 when rdev is not in that slot at all, -EBUSY when it
              * cannot be removed now, otherwise the result of
              * md_integrity_register().
              */
1859         struct r10conf *conf = mddev->private;
1860         int err = 0;
1861         int number = rdev->raid_disk;
1862         struct md_rdev **rdevp;
1863         struct raid10_info *p = conf->mirrors + number;
1864
1865         print_conf(conf);
             /* Work out which of the slot's two pointers refers to rdev. */
1866         if (rdev == p->rdev)
1867                 rdevp = &p->rdev;
1868         else if (rdev == p->replacement)
1869                 rdevp = &p->replacement;
1870         else
1871                 return 0;
1872
             /* Refuse while the device is In_sync or still has pending users. */
1873         if (test_bit(In_sync, &rdev->flags) ||
1874             atomic_read(&rdev->nr_pending)) {
1875                 err = -EBUSY;
1876                 goto abort;
1877         }
1878         /* Only remove non-faulty devices if recovery
1879          * is not possible.
1880          */
1881         if (!test_bit(Faulty, &rdev->flags) &&
1882             mddev->recovery_disabled != p->recovery_disabled &&
1883             (!p->replacement || p->replacement == rdev) &&
1884             number < conf->geo.raid_disks &&
1885             enough(conf, -1)) {
1886                 err = -EBUSY;
1887                 goto abort;
1888         }
             /*
              * Clear the pointer, then (unless RemoveSynchronized is set) wait
              * for an RCU grace period and re-check nr_pending; if a user
              * appeared in the meantime, put the pointer back and give up.
              */
1889         *rdevp = NULL;
1890         if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1891                 synchronize_rcu();
1892                 if (atomic_read(&rdev->nr_pending)) {
1893                         /* lost the race, try later */
1894                         err = -EBUSY;
1895                         *rdevp = rdev;
1896                         goto abort;
1897                 }
1898         }
1899         if (p->replacement) {
1900                 /* We must have just cleared 'rdev' */
                     /*
                      * Promote the replacement to be the slot's main device:
                      * p->rdev is set first and p->replacement cleared only
                      * after the barrier.
                      */
1901                 p->rdev = p->replacement;
1902                 clear_bit(Replacement, &p->replacement->flags);
1903                 smp_mb(); /* Make sure other CPUs may see both as identical
1904                            * but will never see neither -- if they are careful.
1905                            */
1906                 p->replacement = NULL;
1907         }
1908
1909         clear_bit(WantReplacement, &rdev->flags);
1910         err = md_integrity_register(mddev);
1911
1912 abort:
1913
1914         print_conf(conf);
1915         return err;
1916 }
1917
1918 static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
1919 {
             /*
              * Common completion work for a sync/recovery/reshape read of
              * 'bio'; 'd' indexes conf->mirrors[] for the device that was read.
              */
1920         struct r10conf *conf = r10_bio->mddev->private;
1921
1922         if (!bio->bi_status)
1923                 set_bit(R10BIO_Uptodate, &r10_bio->state);
1924         else
1925                 /* The write handler will notice the lack of
1926                  * R10BIO_Uptodate and record any errors etc
1927                  */
1928                 atomic_add(r10_bio->sectors,
1929                            &conf->mirrors[d].rdev->corrected_errors);
1930
1931         /* for reconstruct, we always reschedule after a read.
1932          * for resync, only after all reads
1933          */
1934         rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
             /*
              * Because of '||' short-circuiting, 'remaining' is not
              * decremented here when R10BIO_IsRecover is set.
              */
1935         if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1936             atomic_dec_and_test(&r10_bio->remaining)) {
1937                 /* we have read all the blocks,
1938                  * do the comparison in process context in raid10d
1939                  */
1940                 reschedule_retry(r10_bio);
1941         }
1942 }
1943
1944 static void end_sync_read(struct bio *bio)
1945 {
             /*
              * Read-completion callback (sync_request_write() compares
              * bi_end_io against this function).  The owning r10bio comes
              * from get_resync_r10bio() and the device index from
              * find_bio_disk(); the rest is done by __end_sync_read().
              */
1946         struct r10bio *r10_bio = get_resync_r10bio(bio);
1947         struct r10conf *conf = r10_bio->mddev->private;
1948         int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1949
1950         __end_sync_read(r10_bio, bio, d);
1951 }
1952
1953 static void end_reshape_read(struct bio *bio)
1954 {
1955         /* reshape read bio isn't allocated from r10buf_pool */
             /*
              * So the r10bio is taken directly from bi_private rather than via
              * get_resync_r10bio().
              * NOTE(review): read_slot is passed as the index into
              * conf->mirrors[]; confirm the reshape path stores a device
              * number there.
              */
1956         struct r10bio *r10_bio = bio->bi_private;
1957
1958         __end_sync_read(r10_bio, bio, r10_bio->read_slot);
1959 }
1960
1961 static void end_sync_request(struct r10bio *r10_bio)
1962 {
             /*
              * Drop one reference on 'r10_bio'.  Each time a count reaches
              * zero the r10bio is either queued for retry handling (MadeGood
              * or WriteError set) or released with put_buf(), and the walk
              * continues with the r10bio stored in its master_bio field.  The
              * walk stops at an r10bio whose master_bio is NULL, after
              * reporting its sectors via md_done_sync(), or as soon as a
              * count does not reach zero.
              */
1963         struct mddev *mddev = r10_bio->mddev;
1964
1965         while (atomic_dec_and_test(&r10_bio->remaining)) {
1966                 if (r10_bio->master_bio == NULL) {
1967                         /* the primary of several recovery bios */
                             /* Sector count is read before r10_bio is handed off below. */
1968                         sector_t s = r10_bio->sectors;
1969                         if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1970                             test_bit(R10BIO_WriteError, &r10_bio->state))
1971                                 reschedule_retry(r10_bio);
1972                         else
1973                                 put_buf(r10_bio);
1974                         md_done_sync(mddev, s, 1);
1975                         break;
1976                 } else {
                             /* Here master_bio holds a pointer to another r10bio; fetch it before hand-off. */
1977                         struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1978                         if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1979                             test_bit(R10BIO_WriteError, &r10_bio->state))
1980                                 reschedule_retry(r10_bio);
1981                         else
1982                                 put_buf(r10_bio);
1983                         r10_bio = r10_bio2;
1984                 }
1985         }
1986 }
1987
1988 static void end_sync_write(struct bio *bio)
1989 {
             /*
              * Write-completion callback (installed as bi_end_io by
              * sync_request_write()).  Records the outcome on the target
              * device and on the r10bio, then drops the device and r10bio
              * references.
              */
1990         struct r10bio *r10_bio = get_resync_r10bio(bio);
1991         struct mddev *mddev = r10_bio->mddev;
1992         struct r10conf *conf = mddev->private;
1993         int d;
1994         sector_t first_bad;
1995         int bad_sectors;
1996         int slot;
1997         int repl;
1998         struct md_rdev *rdev = NULL;
1999
             /* find_bio_disk() also reports the slot and whether this was the replacement bio. */
2000         d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
2001         if (repl)
2002                 rdev = conf->mirrors[d].replacement;
2003         else
2004                 rdev = conf->mirrors[d].rdev;
2005
2006         if (bio->bi_status) {
                     /*
                      * A failed write to a replacement goes to md_error(); a
                      * failed write to the main device is flagged
                      * (WriteErrorSeen, WantReplacement) and noted on the
                      * r10bio as R10BIO_WriteError.
                      */
2007                 if (repl)
2008                         md_error(mddev, rdev);
2009                 else {
2010                         set_bit(WriteErrorSeen, &rdev->flags);
2011                         if (!test_and_set_bit(WantReplacement, &rdev->flags))
2012                                 set_bit(MD_RECOVERY_NEEDED,
2013                                         &rdev->mddev->recovery);
2014                         set_bit(R10BIO_WriteError, &r10_bio->state);
2015                 }
             /* Successful write over a range is_badblock() reports as bad: flag R10BIO_MadeGood. */
2016         } else if (is_badblock(rdev,
2017                              r10_bio->devs[slot].addr,
2018                              r10_bio->sectors,
2019                              &first_bad, &bad_sectors))
2020                 set_bit(R10BIO_MadeGood, &r10_bio->state);
2021
2022         rdev_dec_pending(rdev, mddev);
2023
2024         end_sync_request(r10_bio);
2025 }
2026
2027 /*
2028  * Note: sync and recover are handled very differently for raid10
2029  * This code is for resync.
2030  * For resync, we read through virtual addresses and read all blocks.
2031  * If there is any error, we schedule a write.  The lowest numbered
2032  * drive is authoritative.
2033  * However requests come for physical address, so we need to map.
2034  * For every physical address there are raid_disks/copies virtual addresses,
2035  * which is always at least one, but is not necessarily an integer.
2036  * This means that a physical address can span multiple chunks, so we may
2037  * have to submit multiple io requests for a single sync request.
2038  */
2039 /*
2040  * We check if all blocks are in-sync and only write to blocks that
2041  * aren't in sync
      *
      * The first copy whose read completed without error is used as the
      * reference; every other copy that was read is compared against it and
      * rewritten from it when it differs or could not be read.  Active
      * replacement bios are then submitted as well.
2042  */
2043 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2044 {
2045         struct r10conf *conf = mddev->private;
2046         int i, first;
2047         struct bio *tbio, *fbio;
2048         int vcnt;
2049         struct page **tpages, **fpages;
2050
             /*
              * Start at 1; each bio submitted below adds one, and the final
              * decrement at 'done:' releases this initial count.
              */
2051         atomic_set(&r10_bio->remaining, 1);
2052
2053         /* find the first device with a block */
2054         for (i=0; i<conf->copies; i++)
2055                 if (!r10_bio->devs[i].bio->bi_status)
2056                         break;
2057
             /* No copy was read successfully: nothing to use as a reference. */
2058         if (i == conf->copies)
2059                 goto done;
2060
2061         first = i;
2062         fbio = r10_bio->devs[i].bio;
2063         fbio->bi_iter.bi_size = r10_bio->sectors << 9;
2064         fbio->bi_iter.bi_idx = 0;
2065         fpages = get_resync_pages(fbio)->pages;
2066
             /* Number of pages needed to cover r10_bio->sectors, rounded up. */
2067         vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2068         /* now find blocks with errors */
2069         for (i=0 ; i < conf->copies ; i++) {
2070                 int  j, d;
2071                 struct md_rdev *rdev;
2072                 struct resync_pages *rp;
2073
2074                 tbio = r10_bio->devs[i].bio;
2075
                     /* Only look at bios that were issued as resync reads, and never at the reference. */
2076                 if (tbio->bi_end_io != end_sync_read)
2077                         continue;
2078                 if (i == first)
2079                         continue;
2080
2081                 tpages = get_resync_pages(tbio)->pages;
2082                 d = r10_bio->devs[i].devnum;
2083                 rdev = conf->mirrors[d].rdev;
2084                 if (!r10_bio->devs[i].bio->bi_status) {
2085                         /* We know that the bi_io_vec layout is the same for
2086                          * both 'first' and 'i', so we just compare them.
2087                          * All vec entries are PAGE_SIZE;
2088                          */
2089                         int sectors = r10_bio->sectors;
2090                         for (j = 0; j < vcnt; j++) {
                                     /* The last page may be only partly covered by the request. */
2091                                 int len = PAGE_SIZE;
2092                                 if (sectors < (len / 512))
2093                                         len = sectors * 512;
2094                                 if (memcmp(page_address(fpages[j]),
2095                                            page_address(tpages[j]),
2096                                            len))
2097                                         break;
2098                                 sectors -= len/512;
2099                         }
                             /* All pages matched: this copy needs no write. */
2100                         if (j == vcnt)
2101                                 continue;
2102                         atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2103                         if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2104                                 /* Don't fix anything. */
2105                                 continue;
2106                 } else if (test_bit(FailFast, &rdev->flags)) {
2107                         /* Just give up on this device */
2108                         md_error(rdev->mddev, rdev);
2109                         continue;
2110                 }
2111                 /* Ok, we need to write this bio, either to correct an
2112                  * inconsistency or to correct an unreadable block.
2113                  * First we need to fixup bv_offset, bv_len and
2114                  * bi_vecs, as the read request might have corrupted these
2115                  */
                     /* The resync_pages pointer is fetched before bio_reset() and re-attached afterwards. */
2116                 rp = get_resync_pages(tbio);
2117                 bio_reset(tbio);
2118
2119                 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);
2120
2121                 rp->raid_bio = r10_bio;
2122                 tbio->bi_private = rp;
2123                 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2124                 tbio->bi_end_io = end_sync_write;
2125                 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
2126
                     /* Fill the write bio with the reference copy's data. */
2127                 bio_copy_data(tbio, fbio);
2128
                     /* Take a device reference and an r10bio reference for this write. */
2129                 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2130                 atomic_inc(&r10_bio->remaining);
2131                 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2132
2133                 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2134                         tbio->bi_opf |= MD_FAILFAST;
                     /* Add the device's data_offset to the sector before submission. */
2135                 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2136                 bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
2137                 generic_make_request(tbio);
2138         }
2139
2140         /* Now write out to any replacement devices
2141          * that are active
2142          */
2143         for (i = 0; i < conf->copies; i++) {
2144                 int d;
2145
2146                 tbio = r10_bio->devs[i].repl_bio;
2147                 if (!tbio || !tbio->bi_end_io)
2148                         continue;
                     /*
                      * Copy the reference data into the replacement bio only
                      * when this slot's main bio is neither being rewritten
                      * above nor the reference bio itself.
                      * NOTE(review): presumably in those two cases the
                      * replacement bio already carries the right data -
                      * confirm against the code that sets up repl_bio.
                      */
2149                 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2150                     && r10_bio->devs[i].bio != fbio)
2151                         bio_copy_data(tbio, fbio);
2152                 d = r10_bio->devs[i].devnum;
2153                 atomic_inc(&r10_bio->remaining);
2154                 md_sync_acct(conf->mirrors[d].replacement->bdev,
2155                              bio_sectors(tbio));
2156                 generic_make_request(tbio);
2157         }
2158
2159 done:
             /* If no submitted bio is still outstanding, finish the request here. */
2160         if (atomic_dec_and_test(&r10_bio->remaining)) {
2161                 md_done_sync(mddev, r10_bio->sectors, 1);
2162                 put_buf(r10_bio);
2163         }
2164 }
2165
2166 /*
2167  * Now for the recovery code.
2168  * Recovery happens across physical sectors.
2169  * We recover all non-in_sync drives by finding the virtual address of
2170  * each, and then choose a working drive that also has that virt address.
2171  * There is a separate r10_bio for each non-in_sync drive.
2172  * Only the first two slots are in use: the first for reading,
2173  * the second for writing.
2174  *
2175  */
2176 static void fix_recovery_read_error(struct r10bio *r10_bio)
2177 {
2178         /* We got a read error during recovery.
2179          * We repeat the read in smaller page-sized sections.
2180          * If a read succeeds, write it to the new device or record
2181          * a bad block if we cannot.
2182          * If a read fails, record a bad block on both old and
2183          * new devices.
2184          */
2185         struct mddev *mddev = r10_bio->mddev;
2186         struct r10conf *conf = mddev->private;
2187         struct bio *bio = r10_bio->devs[0].bio;
2188         sector_t sect = 0;
2189         int sectors = r10_bio->sectors;
2190         int idx = 0;
2191         int dr = r10_bio->devs[0].devnum;
2192         int dw = r10_bio->devs[1].devnum;
2193         struct page **pages = get_resync_pages(bio)->pages;
2194
2195         while (sectors) {
2196                 int s = sectors;
2197                 struct md_rdev *rdev;
2198                 sector_t addr;
2199                 int ok;
2200
2201                 if (s > (PAGE_SIZE>>9))
2202                         s = PAGE_SIZE >> 9;
2203
2204                 rdev = conf->mirrors[dr].rdev;
2205                 addr = r10_bio->devs[0].addr + sect,
2206                 ok = sync_page_io(rdev,
2207                                   addr,
2208                                   s << 9,
2209                                   pages[idx],
2210                                   REQ_OP_READ, 0, false);
2211                 if (ok) {
2212                         rdev = conf->mirrors[dw].rdev;
2213                         addr = r10_bio->devs[1].addr + sect;
2214                         ok = sync_page_io(rdev,
2215                                           addr,
2216                                           s << 9,
2217                                           pages[idx],
2218                                           REQ_OP_WRITE, 0, false);
2219                         if (!ok) {
2220                                 set_bit(WriteErrorSeen, &rdev->flags);
2221                                 if (!test_and_set_bit(WantReplacement,
2222                                                       &rdev->flags))
2223                                         set_bit(MD_RECOVERY_NEEDED,
2224                                                 &rdev->mddev->recovery);
2225                         }
2226                 }
2227                 if (!ok) {
2228                         /* We don't worry if we cannot set a bad block -
2229                          * it really is bad so there is no loss in not
2230                          * recording it yet
2231                          */
2232                         rdev_set_badblocks(rdev, addr, s, 0);
2233
2234                         if (rdev != conf->mirrors[dw].rdev) {
2235                                 /* need bad block on destination too */
2236                                 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2237                                 addr = r10_bio->devs[1].addr + sect;
2238                                 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2239                                 if (!ok) {
2240                                         /* just abort the recovery */
2241                                         pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2242                                                   mdname(mddev));
2243
2244                                         conf->mirrors[dw].recovery_disabled
2245                                                 = mddev->recovery_disabled;
2246                                         set_bit(MD_RECOVERY_INTR,
2247                                                 &mddev->recovery);
2248                                         break;
2249                                 }
2250                         }
2251                 }
2252
2253                 sectors -= s;
2254                 sect += s;
2255                 idx++;
2256         }
2257 }
2258
2259 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2260 {
2261         struct r10conf *conf = mddev->private;
2262         int d;
2263         struct bio *wbio, *wbio2;
2264
2265         if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2266                 fix_recovery_read_error(r10_bio);
2267                 end_sync_request(r10_bio);
2268                 return;
2269         }
2270
2271         /*
2272          * share the pages with the first bio
2273          * and submit the write request
2274          */
2275         d = r10_bio->devs[1].devnum;
2276         wbio = r10_bio->devs[1].bio;
2277         wbio2 = r10_bio->devs[1].repl_bio;
2278         /* Need to test wbio2->bi_end_io before we call
2279          * generic_make_request as if the former is NULL,
2280          * the latter is free to free wbio2.
2281          */
2282         if (wbio2 && !wbio2->bi_end_io)
2283                 wbio2 = NULL;
2284         if (wbio->bi_end_io) {
2285                 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2286                 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2287                 generic_make_request(wbio);
2288         }
2289         if (wbio2) {
2290                 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2291                 md_sync_acct(conf->mirrors[d].replacement->bdev,
2292                              bio_sectors(wbio2));
2293                 generic_make_request(wbio2);
2294         }
2295 }
2296
2297 /*
2298  * Used by fix_read_error() to decay the per rdev read_errors.
2299  * We halve the read error count for every hour that has elapsed
2300  * since the last recorded read error.
2301  *
2302  */
2303 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2304 {
2305         long cur_time_mon;
2306         unsigned long hours_since_last;
2307         unsigned int read_errors = atomic_read(&rdev->read_errors);
2308
2309         cur_time_mon = ktime_get_seconds();
2310
2311         if (rdev->last_read_error == 0) {
2312                 /* first time we've seen a read error */
2313                 rdev->last_read_error = cur_time_mon;
2314                 return;
2315         }
2316
2317         hours_since_last = (long)(cur_time_mon -
2318                             rdev->last_read_error) / 3600;
2319
2320         rdev->last_read_error = cur_time_mon;
2321
2322         /*
2323          * if hours_since_last is > the number of bits in read_errors
2324          * just set read errors to 0. We do this to avoid
2325          * overflowing the shift of read_errors by hours_since_last.
2326          */
2327         if (hours_since_last >= 8 * sizeof(read_errors))
2328                 atomic_set(&rdev->read_errors, 0);
2329         else
2330                 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2331 }
2332
2333 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2334                             int sectors, struct page *page, int rw)
2335 {
2336         sector_t first_bad;
2337         int bad_sectors;
2338
2339         if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2340             && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2341                 return -1;
2342         if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
2343                 /* success */
2344                 return 1;
2345         if (rw == WRITE) {
2346                 set_bit(WriteErrorSeen, &rdev->flags);
2347                 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2348                         set_bit(MD_RECOVERY_NEEDED,
2349                                 &rdev->mddev->recovery);
2350         }
2351         /* need to record an error - either for the block or the device */
2352         if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2353                 md_error(rdev->mddev, rdev);
2354         return 0;
2355 }
2356
2357 /*
2358  * This is a kernel thread which:
2359  *
2360  *      1.      Retries failed read operations on working mirrors.
2361  *      2.      Updates the raid superblock when problems are encountered.
2362  *      3.      Performs writes following reads for array synchronising.
2363  */
2364
2365 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2366 {
2367         int sect = 0; /* Offset from r10_bio->sector */
2368         int sectors = r10_bio->sectors;
2369         struct md_rdev *rdev;
2370         int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2371         int d = r10_bio->devs[r10_bio->read_slot].devnum;
2372
2373         /* still own a reference to this rdev, so it cannot
2374          * have been cleared recently.
2375          */
2376         rdev = conf->mirrors[d].rdev;
2377
2378         if (test_bit(Faulty, &rdev->flags))
2379                 /* drive has already been failed, just ignore any
2380                    more fix_read_error() attempts */
2381                 return;
2382
2383         check_decay_read_errors(mddev, rdev);
2384         atomic_inc(&rdev->read_errors);
2385         if (atomic_read(&rdev->read_errors) > max_read_errors) {
2386                 char b[BDEVNAME_SIZE];
2387                 bdevname(rdev->bdev, b);
2388
2389                 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2390                           mdname(mddev), b,
2391                           atomic_read(&rdev->read_errors), max_read_errors);
2392                 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2393                           mdname(mddev), b);
2394                 md_error(mddev, rdev);
2395                 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2396                 return;
2397         }
2398
2399         while(sectors) {
2400                 int s = sectors;
2401                 int sl = r10_bio->read_slot;
2402                 int success = 0;
2403                 int start;
2404
2405                 if (s > (PAGE_SIZE>>9))
2406                         s = PAGE_SIZE >> 9;
2407
2408                 rcu_read_lock();
2409                 do {
2410                         sector_t first_bad;
2411                         int bad_sectors;
2412
2413                         d = r10_bio->devs[sl].devnum;
2414                         rdev = rcu_dereference(conf->mirrors[d].rdev);
2415                         if (rdev &&
2416                             test_bit(In_sync, &rdev->flags) &&
2417                             !test_bit(Faulty, &rdev->flags) &&
2418                             is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2419                                         &first_bad, &bad_sectors) == 0) {
2420                                 atomic_inc(&rdev->nr_pending);
2421                                 rcu_read_unlock();
2422                                 success = sync_page_io(rdev,
2423                                                        r10_bio->devs[sl].addr +
2424                                                        sect,
2425                                                        s<<9,
2426                                                        conf->tmppage,
2427                                                        REQ_OP_READ, 0, false);
2428                                 rdev_dec_pending(rdev, mddev);
2429                                 rcu_read_lock();
2430                                 if (success)
2431                                         break;
2432                         }
2433                         sl++;
2434                         if (sl == conf->copies)
2435                                 sl = 0;
2436                 } while (!success && sl != r10_bio->read_slot);
2437                 rcu_read_unlock();
2438
2439                 if (!success) {
2440                         /* Cannot read from anywhere, just mark the block
2441                          * as bad on the first device to discourage future
2442                          * reads.
2443                          */
2444                         int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2445                         rdev = conf->mirrors[dn].rdev;
2446
2447                         if (!rdev_set_badblocks(
2448                                     rdev,
2449                                     r10_bio->devs[r10_bio->read_slot].addr
2450                                     + sect,
2451                                     s, 0)) {
2452                                 md_error(mddev, rdev);
2453                                 r10_bio->devs[r10_bio->read_slot].bio
2454                                         = IO_BLOCKED;
2455                         }
2456                         break;
2457                 }
2458
2459                 start = sl;
2460                 /* write it back and re-read */
2461                 rcu_read_lock();
2462                 while (sl != r10_bio->read_slot) {
2463                         char b[BDEVNAME_SIZE];
2464
2465                         if (sl==0)
2466                                 sl = conf->copies;
2467                         sl--;
2468                         d = r10_bio->devs[sl].devnum;
2469                         rdev = rcu_dereference(conf->mirrors[d].rdev);
2470                         if (!rdev ||
2471                             test_bit(Faulty, &rdev->flags) ||
2472                             !test_bit(In_sync, &rdev->flags))
2473                                 continue;
2474
2475                         atomic_inc(&rdev->nr_pending);
2476                         rcu_read_unlock();
2477                         if (r10_sync_page_io(rdev,
2478                                              r10_bio->devs[sl].addr +
2479                                              sect,
2480                                              s, conf->tmppage, WRITE)
2481                             == 0) {
2482                                 /* Well, this device is dead */
2483                                 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2484                                           mdname(mddev), s,
2485                                           (unsigned long long)(
2486                                                   sect +
2487                                                   choose_data_offset(r10_bio,
2488                                                                      rdev)),
2489                                           bdevname(rdev->bdev, b));
2490                                 pr_notice("md/raid10:%s: %s: failing drive\n",
2491                                           mdname(mddev),
2492                                           bdevname(rdev->bdev, b));
2493                         }
2494                         rdev_dec_pending(rdev, mddev);
2495                         rcu_read_lock();
2496                 }
2497                 sl = start;
2498                 while (sl != r10_bio->read_slot) {
2499                         char b[BDEVNAME_SIZE];
2500
2501                         if (sl==0)
2502                                 sl = conf->copies;
2503                         sl--;
2504                         d = r10_bio->devs[sl].devnum;
2505                         rdev = rcu_dereference(conf->mirrors[d].rdev);
2506                         if (!rdev ||
2507                             test_bit(Faulty, &rdev->flags) ||
2508                             !test_bit(In_sync, &rdev->flags))
2509                                 continue;
2510
2511                         atomic_inc(&rdev->nr_pending);
2512                         rcu_read_unlock();
2513                         switch (r10_sync_page_io(rdev,
2514                                              r10_bio->devs[sl].addr +
2515                                              sect,
2516                                              s, conf->tmppage,
2517                                                  READ)) {
2518                         case 0:
2519                                 /* Well, this device is dead */
2520                                 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2521                                        mdname(mddev), s,
2522                                        (unsigned long long)(
2523                                                sect +
2524                                                choose_data_offset(r10_bio, rdev)),
2525                                        bdevname(rdev->bdev, b));
2526                                 pr_notice("md/raid10:%s: %s: failing drive\n",
2527                                        mdname(mddev),
2528                                        bdevname(rdev->bdev, b));
2529                                 break;
2530                         case 1:
2531                                 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2532                                        mdname(mddev), s,
2533                                        (unsigned long long)(
2534                                                sect +
2535                                                choose_data_offset(r10_bio, rdev)),
2536                                        bdevname(rdev->bdev, b));
2537                                 atomic_add(s, &rdev->corrected_errors);
2538                         }
2539
2540                         rdev_dec_pending(rdev, mddev);
2541                         rcu_read_lock();
2542                 }
2543                 rcu_read_unlock();
2544
2545                 sectors -= s;
2546                 sect += s;
2547         }
2548 }
2549
2550 static int narrow_write_error(struct r10bio *r10_bio, int i)
2551 {
2552         struct bio *bio = r10_bio->master_bio;
2553         struct mddev *mddev = r10_bio->mddev;
2554         struct r10conf *conf = mddev->private;
2555         struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2556         /* bio has the data to be written to slot 'i' where
2557          * we just recently had a write error.
2558          * We repeatedly clone the bio and trim down to one block,
2559          * then try the write.  Where the write fails we record
2560          * a bad block.
2561          * It is conceivable that the bio doesn't exactly align with
2562          * blocks.  We must handle this.
2563          *
2564          * We currently own a reference to the rdev.
2565          */
2566
2567         int block_sectors;
2568         sector_t sector;
2569         int sectors;
2570         int sect_to_write = r10_bio->sectors;
2571         int ok = 1;
2572
2573         if (rdev->badblocks.shift < 0)
2574                 return 0;
2575
2576         block_sectors = roundup(1 << rdev->badblocks.shift,
2577                                 bdev_logical_block_size(rdev->bdev) >> 9);
2578         sector = r10_bio->sector;
2579         sectors = ((r10_bio->sector + block_sectors)
2580                    & ~(sector_t)(block_sectors - 1))
2581                 - sector;
2582
2583         while (sect_to_write) {
2584                 struct bio *wbio;
2585                 sector_t wsector;
2586                 if (sectors > sect_to_write)
2587                         sectors = sect_to_write;
2588                 /* Write at 'sector' for 'sectors' */
2589                 wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
2590                 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2591                 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2592                 wbio->bi_iter.bi_sector = wsector +
2593                                    choose_data_offset(r10_bio, rdev);
2594                 bio_set_dev(wbio, rdev->bdev);
2595                 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2596
2597                 if (submit_bio_wait(wbio) < 0)
2598                         /* Failure! */
2599                         ok = rdev_set_badblocks(rdev, wsector,
2600                                                 sectors, 0)
2601                                 && ok;
2602
2603                 bio_put(wbio);
2604                 sect_to_write -= sectors;
2605                 sector += sectors;
2606                 sectors = block_sectors;
2607         }
2608         return ok;
2609 }
2610
2611 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2612 {
2613         int slot = r10_bio->read_slot;
2614         struct bio *bio;
2615         struct r10conf *conf = mddev->private;
2616         struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2617
2618         /* we got a read error. Maybe the drive is bad.  Maybe just
2619          * the block and we can fix it.
2620          * We freeze all other IO, and try reading the block from
2621          * other devices.  When we find one, we re-write
2622          * and check it that fixes the read error.
2623          * This is all done synchronously while the array is
2624          * frozen.
2625          */
2626         bio = r10_bio->devs[slot].bio;
2627         bio_put(bio);
2628         r10_bio->devs[slot].bio = NULL;
2629
2630         if (mddev->ro)
2631                 r10_bio->devs[slot].bio = IO_BLOCKED;
2632         else if (!test_bit(FailFast, &rdev->flags)) {
2633                 freeze_array(conf, 1);
2634                 fix_read_error(conf, mddev, r10_bio);
2635                 unfreeze_array(conf);
2636         } else
2637                 md_error(mddev, rdev);
2638
2639         rdev_dec_pending(rdev, mddev);
2640         allow_barrier(conf);
2641         r10_bio->state = 0;
2642         raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
2643 }
2644
2645 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2646 {
2647         /* Some sort of write request has finished and it
2648          * succeeded in writing where we thought there was a
2649          * bad block.  So forget the bad block.
2650          * Or possibly if failed and we need to record
2651          * a bad block.
2652          */
2653         int m;
2654         struct md_rdev *rdev;
2655
2656         if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2657             test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2658                 for (m = 0; m < conf->copies; m++) {
2659                         int dev = r10_bio->devs[m].devnum;
2660                         rdev = conf->mirrors[dev].rdev;
2661                         if (r10_bio->devs[m].bio == NULL ||
2662                                 r10_bio->devs[m].bio->bi_end_io == NULL)
2663                                 continue;
2664                         if (!r10_bio->devs[m].bio->bi_status) {
2665                                 rdev_clear_badblocks(
2666                                         rdev,
2667                                         r10_bio->devs[m].addr,
2668                                         r10_bio->sectors, 0);
2669                         } else {
2670                                 if (!rdev_set_badblocks(
2671                                             rdev,
2672                                             r10_bio->devs[m].addr,
2673                                             r10_bio->sectors, 0))
2674                                         md_error(conf->mddev, rdev);
2675                         }
2676                         rdev = conf->mirrors[dev].replacement;
2677                         if (r10_bio->devs[m].repl_bio == NULL ||
2678                                 r10_bio->devs[m].repl_bio->bi_end_io == NULL)
2679                                 continue;
2680
2681                         if (!r10_bio->devs[m].repl_bio->bi_status) {
2682                                 rdev_clear_badblocks(
2683                                         rdev,
2684                                         r10_bio->devs[m].addr,
2685                                         r10_bio->sectors, 0);
2686                         } else {
2687                                 if (!rdev_set_badblocks(
2688                                             rdev,
2689                                             r10_bio->devs[m].addr,
2690                                             r10_bio->sectors, 0))
2691                                         md_error(conf->mddev, rdev);
2692                         }
2693                 }
2694                 put_buf(r10_bio);
2695         } else {
2696                 bool fail = false;
2697                 for (m = 0; m < conf->copies; m++) {
2698                         int dev = r10_bio->devs[m].devnum;
2699                         struct bio *bio = r10_bio->devs[m].bio;
2700                         rdev = conf->mirrors[dev].rdev;
2701                         if (bio == IO_MADE_GOOD) {
2702                                 rdev_clear_badblocks(
2703                                         rdev,
2704                                         r10_bio->devs[m].addr,
2705                                         r10_bio->sectors, 0);
2706                                 rdev_dec_pending(rdev, conf->mddev);
2707                         } else if (bio != NULL && bio->bi_status) {
2708                                 fail = true;
2709                                 if (!narrow_write_error(r10_bio, m)) {
2710                                         md_error(conf->mddev, rdev);
2711                                         set_bit(R10BIO_Degraded,
2712                                                 &r10_bio->state);
2713                                 }
2714                                 rdev_dec_pending(rdev, conf->mddev);
2715                         }
2716                         bio = r10_bio->devs[m].repl_bio;
2717                         rdev = conf->mirrors[dev].replacement;
2718                         if (rdev && bio == IO_MADE_GOOD) {
2719                                 rdev_clear_badblocks(
2720                                         rdev,
2721                                         r10_bio->devs[m].addr,
2722                                         r10_bio->sectors, 0);
2723                                 rdev_dec_pending(rdev, conf->mddev);
2724                         }
2725                 }
2726                 if (fail) {
2727                         spin_lock_irq(&conf->device_lock);
2728                         list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2729                         conf->nr_queued++;
2730                         spin_unlock_irq(&conf->device_lock);
2731                         /*
2732                          * In case freeze_array() is waiting for condition
2733                          * nr_pending == nr_queued + extra to be true.
2734                          */
2735                         wake_up(&conf->wait_barrier);
2736                         md_wakeup_thread(conf->mddev->thread);
2737                 } else {
2738                         if (test_bit(R10BIO_WriteError,
2739                                      &r10_bio->state))
2740                                 close_write(r10_bio);
2741                         raid_end_bio_io(r10_bio);
2742                 }
2743         }
2744 }
2745
2746 static void raid10d(struct md_thread *thread)
2747 {
2748         struct mddev *mddev = thread->mddev;
2749         struct r10bio *r10_bio;
2750         unsigned long flags;
2751         struct r10conf *conf = mddev->private;
2752         struct list_head *head = &conf->retry_list;
2753         struct blk_plug plug;
2754
2755         md_check_recovery(mddev);
2756
2757         if (!list_empty_careful(&conf->bio_end_io_list) &&
2758             !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2759                 LIST_HEAD(tmp);
2760                 spin_lock_irqsave(&conf->device_lock, flags);
2761                 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2762                         while (!list_empty(&conf->bio_end_io_list)) {
2763                                 list_move(conf->bio_end_io_list.prev, &tmp);
2764                                 conf->nr_queued--;
2765                         }
2766                 }
2767                 spin_unlock_irqrestore(&conf->device_lock, flags);
2768                 while (!list_empty(&tmp)) {
2769                         r10_bio = list_first_entry(&tmp, struct r10bio,
2770                                                    retry_list);
2771                         list_del(&r10_bio->retry_list);
2772                         if (mddev->degraded)
2773                                 set_bit(R10BIO_Degraded, &r10_bio->state);
2774
2775                         if (test_bit(R10BIO_WriteError,
2776                                      &r10_bio->state))
2777                                 close_write(r10_bio);
2778                         raid_end_bio_io(r10_bio);
2779                 }
2780         }
2781
2782         blk_start_plug(&plug);
2783         for (;;) {
2784
2785                 flush_pending_writes(conf);
2786
2787                 spin_lock_irqsave(&conf->device_lock, flags);
2788                 if (list_empty(head)) {
2789                         spin_unlock_irqrestore(&conf->device_lock, flags);
2790                         break;
2791                 }
2792                 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2793                 list_del(head->prev);
2794                 conf->nr_queued--;
2795                 spin_unlock_irqrestore(&conf->device_lock, flags);
2796
2797                 mddev = r10_bio->mddev;
2798                 conf = mddev->private;
2799                 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2800                     test_bit(R10BIO_WriteError, &r10_bio->state))
2801                         handle_write_completed(conf, r10_bio);
2802                 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2803                         reshape_request_write(mddev, r10_bio);
2804                 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2805                         sync_request_write(mddev, r10_bio);
2806                 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2807                         recovery_request_write(mddev, r10_bio);
2808                 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2809                         handle_read_error(mddev, r10_bio);
2810                 else
2811                         WARN_ON_ONCE(1);
2812
2813                 cond_resched();
2814                 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2815                         md_check_recovery(mddev);
2816         }
2817         blk_finish_plug(&plug);
2818 }
2819
2820 static int init_resync(struct r10conf *conf)
2821 {
2822         int ret, buffs, i;
2823
2824         buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2825         BUG_ON(mempool_initialized(&conf->r10buf_pool));
2826         conf->have_replacement = 0;
2827         for (i = 0; i < conf->geo.raid_disks; i++)
2828                 if (conf->mirrors[i].replacement)
2829                         conf->have_replacement = 1;
2830         ret = mempool_init(&conf->r10buf_pool, buffs,
2831                            r10buf_pool_alloc, r10buf_pool_free, conf);
2832         if (ret)
2833                 return ret;
2834         conf->next_resync = 0;
2835         return 0;
2836 }
2837
2838 static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
2839 {
2840         struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
2841         struct rsync_pages *rp;
2842         struct bio *bio;
2843         int nalloc;
2844         int i;
2845
2846         if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
2847             test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
2848                 nalloc = conf->copies; /* resync */
2849         else
2850                 nalloc = 2; /* recovery */
2851
2852         for (i = 0; i < nalloc; i++) {
2853                 bio = r10bio->devs[i].bio;
2854                 rp = bio->bi_private;
2855                 bio_reset(bio);
2856                 bio->bi_private = rp;
2857                 bio = r10bio->devs[i].repl_bio;
2858                 if (bio) {
2859                         rp = bio->bi_private;
2860                         bio_reset(bio);
2861                         bio->bi_private = rp;
2862                 }
2863         }
2864         return r10bio;
2865 }
2866
2867 /*
2868  * Set cluster_sync_high since we need other nodes to add the
2869  * range [cluster_sync_low, cluster_sync_high] to suspend list.
2870  */
2871 static void raid10_set_cluster_sync_high(struct r10conf *conf)
2872 {
2873         sector_t window_size;
2874         int extra_chunk, chunks;
2875
2876         /*
2877          * First, here we define "stripe" as a unit which across
2878          * all member devices one time, so we get chunks by use
2879          * raid_disks / near_copies. Otherwise, if near_copies is
2880          * close to raid_disks, then resync window could increases
2881          * linearly with the increase of raid_disks, which means
2882          * we will suspend a really large IO window while it is not
2883          * necessary. If raid_disks is not divisible by near_copies,
2884          * an extra chunk is needed to ensure the whole "stripe" is
2885          * covered.
2886          */
2887
2888         chunks = conf->geo.raid_disks / conf->geo.near_copies;
2889         if (conf->geo.raid_disks % conf->geo.near_copies == 0)
2890                 extra_chunk = 0;
2891         else
2892                 extra_chunk = 1;
2893         window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
2894
2895         /*
2896          * At least use a 32M window to align with raid1's resync window
2897          */
2898         window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
2899                         CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
2900
2901         conf->cluster_sync_high = conf->cluster_sync_low + window_size;
2902 }
2903
2904 /*
2905  * perform a "sync" on one "block"
2906  *
2907  * We need to make sure that no normal I/O request - particularly write
2908  * requests - conflict with active sync requests.
2909  *
2910  * This is achieved by tracking pending requests and a 'barrier' concept
2911  * that can be installed to exclude normal IO requests.
2912  *
2913  * Resync and recovery are handled very differently.
2914  * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
2915  *
2916  * For resync, we iterate over virtual addresses, read all copies,
2917  * and update if there are differences.  If only one copy is live,
2918  * skip it.
2919  * For recovery, we iterate over physical addresses, read a good
2920  * value for each non-in_sync drive, and over-write.
2921  *
2922  * So, for recovery we may have several outstanding complex requests for a
2923  * given address, one for each out-of-sync device.  We model this by allocating
2924  * a number of r10_bio structures, one for each out-of-sync device.
 * As we set up these structures, we collect all bios together into a list
 * which we then process collectively to add pages, and then process again
 * to pass to generic_make_request.
2928  *
2929  * The r10_bio structures are linked using a borrowed master_bio pointer.
2930  * This link is counted in ->remaining.  When the r10_bio that points to NULL
2931  * has its remaining count decremented to 0, the whole complex operation
2932  * is complete.
2933  *
2934  */
2935
2936 static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
2937                              int *skipped)
2938 {
2939         struct r10conf *conf = mddev->private;
2940         struct r10bio *r10_bio;
2941         struct bio *biolist = NULL, *bio;
2942         sector_t max_sector, nr_sectors;
2943         int i;
2944         int max_sync;
2945         sector_t sync_blocks;
2946         sector_t sectors_skipped = 0;
2947         int chunks_skipped = 0;
2948         sector_t chunk_mask = conf->geo.chunk_mask;
2949         int page_idx = 0;
2950
2951         if (!mempool_initialized(&conf->r10buf_pool))
2952                 if (init_resync(conf))
2953                         return 0;
2954
2955         /*
2956          * Allow skipping a full rebuild for incremental assembly
2957          * of a clean array, like RAID1 does.
2958          */
2959         if (mddev->bitmap == NULL &&
2960             mddev->recovery_cp == MaxSector &&
2961             mddev->reshape_position == MaxSector &&
2962             !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2963             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2964             !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2965             conf->fullsync == 0) {
2966                 *skipped = 1;
2967                 return mddev->dev_sectors - sector_nr;
2968         }
2969
2970  skipped:
2971         max_sector = mddev->dev_sectors;
2972         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2973             test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2974                 max_sector = mddev->resync_max_sectors;
2975         if (sector_nr >= max_sector) {
2976                 conf->cluster_sync_low = 0;
2977                 conf->cluster_sync_high = 0;
2978
2979                 /* If we aborted, we need to abort the
2980                  * sync on the 'current' bitmap chucks (there can
2981                  * be several when recovering multiple devices).
2982                  * as we may have started syncing it but not finished.
2983                  * We can find the current address in
2984                  * mddev->curr_resync, but for recovery,
2985                  * we need to convert that to several
2986                  * virtual addresses.
2987                  */
2988                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2989                         end_reshape(conf);
2990                         close_sync(conf);
2991                         return 0;
2992                 }
2993
2994                 if (mddev->curr_resync < max_sector) { /* aborted */
2995                         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2996                                 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2997                                                    &sync_blocks, 1);
2998                         else for (i = 0; i < conf->geo.raid_disks; i++) {
2999                                 sector_t sect =
3000                                         raid10_find_virt(conf, mddev->curr_resync, i);
3001                                 md_bitmap_end_sync(mddev->bitmap, sect,
3002                                                    &sync_blocks, 1);
3003                         }
3004                 } else {
3005                         /* completed sync */
3006                         if ((!mddev->bitmap || conf->fullsync)
3007                             && conf->have_replacement
3008                             && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3009                                 /* Completed a full sync so the replacements
3010                                  * are now fully recovered.
3011                                  */
3012                                 rcu_read_lock();
3013                                 for (i = 0; i < conf->geo.raid_disks; i++) {
3014                                         struct md_rdev *rdev =
3015                                                 rcu_dereference(conf->mirrors[i].replacement);
3016                                         if (rdev)
3017                                                 rdev->recovery_offset = MaxSector;
3018                                 }
3019                                 rcu_read_unlock();
3020                         }
3021                         conf->fullsync = 0;
3022                 }
3023                 md_bitmap_close_sync(mddev->bitmap);
3024                 close_sync(conf);
3025                 *skipped = 1;
3026                 return sectors_skipped;
3027         }
3028
3029         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3030                 return reshape_request(mddev, sector_nr, skipped);
3031
3032         if (chunks_skipped >= conf->geo.raid_disks) {
3033                 /* if there has been nothing to do on any drive,
3034                  * then there is nothing to do at all..
3035                  */
3036                 *skipped = 1;
3037                 return (max_sector - sector_nr) + sectors_skipped;
3038         }
3039
3040         if (max_sector > mddev->resync_max)
3041                 max_sector = mddev->resync_max; /* Don't do IO beyond here */
3042
3043         /* make sure whole request will fit in a chunk - if chunks
3044          * are meaningful
3045          */
3046         if (conf->geo.near_copies < conf->geo.raid_disks &&
3047             max_sector > (sector_nr | chunk_mask))
3048                 max_sector = (sector_nr | chunk_mask) + 1;
3049
3050         /*
3051          * If there is non-resync activity waiting for a turn, then let it
3052          * though before starting on this new sync request.
3053          */
3054         if (conf->nr_waiting)
3055                 schedule_timeout_uninterruptible(1);
3056
3057         /* Again, very different code for resync and recovery.
3058          * Both must result in an r10bio with a list of bios that
3059          * have bi_end_io, bi_sector, bi_disk set,
3060          * and bi_private set to the r10bio.
3061          * For recovery, we may actually create several r10bios
3062          * with 2 bios in each, that correspond to the bios in the main one.
3063          * In this case, the subordinate r10bios link back through a
3064          * borrowed master_bio pointer, and the counter in the master
3065          * includes a ref from each subordinate.
3066          */
3067         /* First, we decide what to do and set ->bi_end_io
3068          * To end_sync_read if we want to read, and
3069          * end_sync_write if we will want to write.
3070          */
3071
3072         max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3073         if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3074                 /* recovery... the complicated one */
3075                 int j;
3076                 r10_bio = NULL;
3077
3078                 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3079                         int still_degraded;
3080                         struct r10bio *rb2;
3081                         sector_t sect;
3082                         int must_sync;
3083                         int any_working;
3084                         int need_recover = 0;
3085                         int need_replace = 0;
3086                         struct raid10_info *mirror = &conf->mirrors[i];
3087                         struct md_rdev *mrdev, *mreplace;
3088
3089                         rcu_read_lock();
3090                         mrdev = rcu_dereference(mirror->rdev);
3091                         mreplace = rcu_dereference(mirror->replacement);
3092
3093                         if (mrdev != NULL &&
3094                             !test_bit(Faulty, &mrdev->flags) &&
3095                             !test_bit(In_sync, &mrdev->flags))
3096                                 need_recover = 1;
3097                         if (mreplace != NULL &&
3098                             !test_bit(Faulty, &mreplace->flags))
3099                                 need_replace = 1;
3100
3101                         if (!need_recover && !need_replace) {
3102                                 rcu_read_unlock();
3103                                 continue;
3104                         }
3105
3106                         still_degraded = 0;
3107                         /* want to reconstruct this device */
3108                         rb2 = r10_bio;
3109                         sect = raid10_find_virt(conf, sector_nr, i);
3110                         if (sect >= mddev->resync_max_sectors) {
3111                                 /* last stripe is not complete - don't
3112                                  * try to recover this sector.
3113                                  */
3114                                 rcu_read_unlock();
3115                                 continue;
3116                         }
3117                         if (mreplace && test_bit(Faulty, &mreplace->flags))
3118                                 mreplace = NULL;
3119                         /* Unless we are doing a full sync, or a replacement
3120                          * we only need to recover the block if it is set in
3121                          * the bitmap
3122                          */
3123                         must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3124                                                          &sync_blocks, 1);
3125                         if (sync_blocks < max_sync)
3126                                 max_sync = sync_blocks;
3127                         if (!must_sync &&
3128                             mreplace == NULL &&
3129                             !conf->fullsync) {
3130                                 /* yep, skip the sync_blocks here, but don't assume
3131                                  * that there will never be anything to do here
3132                                  */
3133                                 chunks_skipped = -1;
3134                                 rcu_read_unlock();
3135                                 continue;
3136                         }
3137                         atomic_inc(&mrdev->nr_pending);
3138                         if (mreplace)
3139                                 atomic_inc(&mreplace->nr_pending);
3140                         rcu_read_unlock();
3141
3142                         r10_bio = raid10_alloc_init_r10buf(conf);
3143                         r10_bio->state = 0;
3144                         raise_barrier(conf, rb2 != NULL);
3145                         atomic_set(&r10_bio->remaining, 0);
3146
3147                         r10_bio->master_bio = (struct bio*)rb2;
3148                         if (rb2)
3149                                 atomic_inc(&rb2->remaining);
3150                         r10_bio->mddev = mddev;
3151                         set_bit(R10BIO_IsRecover, &r10_bio->state);
3152                         r10_bio->sector = sect;
3153
3154                         raid10_find_phys(conf, r10_bio);
3155
3156                         /* Need to check if the array will still be
3157                          * degraded
3158                          */
3159                         rcu_read_lock();
3160                         for (j = 0; j < conf->geo.raid_disks; j++) {
3161                                 struct md_rdev *rdev = rcu_dereference(
3162                                         conf->mirrors[j].rdev);
3163                                 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3164                                         still_degraded = 1;
3165                                         break;
3166                                 }
3167                         }
3168
3169                         must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3170                                                          &sync_blocks, still_degraded);
3171
3172                         any_working = 0;
3173                         for (j=0; j<conf->copies;j++) {
3174                                 int k;
3175                                 int d = r10_bio->devs[j].devnum;
3176                                 sector_t from_addr, to_addr;
3177                                 struct md_rdev *rdev =
3178                                         rcu_dereference(conf->mirrors[d].rdev);
3179                                 sector_t sector, first_bad;
3180                                 int bad_sectors;
3181                                 if (!rdev ||
3182                                     !test_bit(In_sync, &rdev->flags))
3183                                         continue;
3184                                 /* This is where we read from */
3185                                 any_working = 1;
3186                                 sector = r10_bio->devs[j].addr;
3187
3188                                 if (is_badblock(rdev, sector, max_sync,
3189                                                 &first_bad, &bad_sectors)) {
3190                                         if (first_bad > sector)
3191                                                 max_sync = first_bad - sector;
3192                                         else {
3193                                                 bad_sectors -= (sector
3194                                                                 - first_bad);
3195                                                 if (max_sync > bad_sectors)
3196                                                         max_sync = bad_sectors;
3197                                                 continue;
3198                                         }
3199                                 }
3200                                 bio = r10_bio->devs[0].bio;
3201                                 bio->bi_next = biolist;
3202                                 biolist = bio;
3203                                 bio->bi_end_io = end_sync_read;
3204                                 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3205                                 if (test_bit(FailFast, &rdev->flags))
3206                                         bio->bi_opf |= MD_FAILFAST;
3207                                 from_addr = r10_bio->devs[j].addr;
3208                                 bio->bi_iter.bi_sector = from_addr +
3209                                         rdev->data_offset;
3210                                 bio_set_dev(bio, rdev->bdev);
3211                                 atomic_inc(&rdev->nr_pending);
3212                                 /* and we write to 'i' (if not in_sync) */
3213
3214                                 for (k=0; k<conf->copies; k++)
3215                                         if (r10_bio->devs[k].devnum == i)
3216                                                 break;
3217                                 BUG_ON(k == conf->copies);
3218                                 to_addr = r10_bio->devs[k].addr;
3219                                 r10_bio->devs[0].devnum = d;
3220                                 r10_bio->devs[0].addr = from_addr;
3221                                 r10_bio->devs[1].devnum = i;
3222                                 r10_bio->devs[1].addr = to_addr;
3223
3224                                 if (need_recover) {
3225                                         bio = r10_bio->devs[1].bio;
3226                                         bio->bi_next = biolist;
3227                                         biolist = bio;
3228                                         bio->bi_end_io = end_sync_write;
3229                                         bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3230                                         bio->bi_iter.bi_sector = to_addr
3231                                                 + mrdev->data_offset;
3232                                         bio_set_dev(bio, mrdev->bdev);
3233                                         atomic_inc(&r10_bio->remaining);
3234                                 } else
3235                                         r10_bio->devs[1].bio->bi_end_io = NULL;
3236
3237                                 /* and maybe write to replacement */
3238                                 bio = r10_bio->devs[1].repl_bio;
3239                                 if (bio)
3240                                         bio->bi_end_io = NULL;
3241                                 /* Note: if need_replace, then bio
3242                                  * cannot be NULL as r10buf_pool_alloc will
3243                                  * have allocated it.
3244                                  */
3245                                 if (!need_replace)
3246                                         break;
3247                                 bio->bi_next = biolist;
3248                                 biolist = bio;
3249                                 bio->bi_end_io = end_sync_write;
3250                                 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3251                                 bio->bi_iter.bi_sector = to_addr +
3252                                         mreplace->data_offset;
3253                                 bio_set_dev(bio, mreplace->bdev);
3254                                 atomic_inc(&r10_bio->remaining);
3255                                 break;
3256                         }
3257                         rcu_read_unlock();
3258                         if (j == conf->copies) {
3259                                 /* Cannot recover, so abort the recovery or
3260                                  * record a bad block */
3261                                 if (any_working) {
3262                                         /* problem is that there are bad blocks
3263                                          * on other device(s)
3264                                          */
3265                                         int k;
3266                                         for (k = 0; k < conf->copies; k++)
3267                                                 if (r10_bio->devs[k].devnum == i)
3268                                                         break;
3269                                         if (!test_bit(In_sync,
3270                                                       &mrdev->flags)
3271                                             && !rdev_set_badblocks(
3272                                                     mrdev,
3273                                                     r10_bio->devs[k].addr,
3274                                                     max_sync, 0))
3275                                                 any_working = 0;
3276                                         if (mreplace &&
3277                                             !rdev_set_badblocks(
3278                                                     mreplace,
3279                                                     r10_bio->devs[k].addr,
3280                                                     max_sync, 0))
3281                                                 any_working = 0;
3282                                 }
3283                                 if (!any_working)  {
3284                                         if (!test_and_set_bit(MD_RECOVERY_INTR,
3285                                                               &mddev->recovery))
3286                                                 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3287                                                        mdname(mddev));
3288                                         mirror->recovery_disabled
3289                                                 = mddev->recovery_disabled;
3290                                 }
3291                                 put_buf(r10_bio);
3292                                 if (rb2)
3293                                         atomic_dec(&rb2->remaining);
3294                                 r10_bio = rb2;
3295                                 rdev_dec_pending(mrdev, mddev);
3296                                 if (mreplace)
3297                                         rdev_dec_pending(mreplace, mddev);
3298                                 break;
3299                         }
3300                         rdev_dec_pending(mrdev, mddev);
3301                         if (mreplace)
3302                                 rdev_dec_pending(mreplace, mddev);
3303                         if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
3304                                 /* Only want this if there is elsewhere to
3305                                  * read from. 'j' is currently the first
3306                                  * readable copy.
3307                                  */
3308                                 int targets = 1;
3309                                 for (; j < conf->copies; j++) {
3310                                         int d = r10_bio->devs[j].devnum;
3311                                         if (conf->mirrors[d].rdev &&
3312                                             test_bit(In_sync,
3313                                                       &conf->mirrors[d].rdev->flags))
3314                                                 targets++;
3315                                 }
3316                                 if (targets == 1)
3317                                         r10_bio->devs[0].bio->bi_opf
3318                                                 &= ~MD_FAILFAST;
3319                         }
3320                 }
3321                 if (biolist == NULL) {
3322                         while (r10_bio) {
3323                                 struct r10bio *rb2 = r10_bio;
3324                                 r10_bio = (struct r10bio*) rb2->master_bio;
3325                                 rb2->master_bio = NULL;
3326                                 put_buf(rb2);
3327                         }
3328                         goto giveup;
3329                 }
3330         } else {
3331                 /* resync. Schedule a read for every block at this virt offset */
3332                 int count = 0;
3333
3334                 /*
3335                  * Since curr_resync_completed could probably not update in
3336                  * time, and we will set cluster_sync_low based on it.
3337                  * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for
3338                  * safety reason, which ensures curr_resync_completed is
3339                  * updated in bitmap_cond_end_sync.
3340                  */
3341                 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
3342                                         mddev_is_clustered(mddev) &&
3343                                         (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
3344
3345                 if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
3346                                           &sync_blocks, mddev->degraded) &&
3347                     !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3348                                                  &mddev->recovery)) {
3349                         /* We can skip this block */
3350                         *skipped = 1;
3351                         return sync_blocks + sectors_skipped;
3352                 }
3353                 if (sync_blocks < max_sync)
3354                         max_sync = sync_blocks;
3355                 r10_bio = raid10_alloc_init_r10buf(conf);
3356                 r10_bio->state = 0;
3357
3358                 r10_bio->mddev = mddev;
3359                 atomic_set(&r10_bio->remaining, 0);
3360                 raise_barrier(conf, 0);
3361                 conf->next_resync = sector_nr;
3362
3363                 r10_bio->master_bio = NULL;
3364                 r10_bio->sector = sector_nr;
3365                 set_bit(R10BIO_IsSync, &r10_bio->state);
3366                 raid10_find_phys(conf, r10_bio);
3367                 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3368
3369                 for (i = 0; i < conf->copies; i++) {
3370                         int d = r10_bio->devs[i].devnum;
3371                         sector_t first_bad, sector;
3372                         int bad_sectors;
3373                         struct md_rdev *rdev;
3374
3375                         if (r10_bio->devs[i].repl_bio)
3376                                 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3377
3378                         bio = r10_bio->devs[i].bio;
3379                         bio->bi_status = BLK_STS_IOERR;
3380                         rcu_read_lock();
3381                         rdev = rcu_dereference(conf->mirrors[d].rdev);
3382                         if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3383                                 rcu_read_unlock();
3384                                 continue;
3385                         }
3386                         sector = r10_bio->devs[i].addr;
3387                         if (is_badblock(rdev, sector, max_sync,
3388                                         &first_bad, &bad_sectors)) {
3389                                 if (first_bad > sector)
3390                                         max_sync = first_bad - sector;
3391                                 else {
3392                                         bad_sectors -= (sector - first_bad);
3393                                         if (max_sync > bad_sectors)
3394                                                 max_sync = bad_sectors;
3395                                         rcu_read_unlock();
3396                                         continue;
3397                                 }
3398                         }
3399                         atomic_inc(&rdev->nr_pending);
3400                         atomic_inc(&r10_bio->remaining);
3401                         bio->bi_next = biolist;
3402                         biolist = bio;
3403                         bio->bi_end_io = end_sync_read;
3404                         bio_set_op_attrs(bio, REQ_OP_READ, 0);
3405                         if (test_bit(FailFast, &rdev->flags))
3406                                 bio->bi_opf |= MD_FAILFAST;
3407                         bio->bi_iter.bi_sector = sector + rdev->data_offset;
3408                         bio_set_dev(bio, rdev->bdev);
3409                         count++;
3410
3411                         rdev = rcu_dereference(conf->mirrors[d].replacement);
3412                         if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3413                                 rcu_read_unlock();
3414                                 continue;
3415                         }
3416                         atomic_inc(&rdev->nr_pending);
3417
3418                         /* Need to set up for writing to the replacement */
3419                         bio = r10_bio->devs[i].repl_bio;
3420                         bio->bi_status = BLK_STS_IOERR;
3421
3422                         sector = r10_bio->devs[i].addr;
3423                         bio->bi_next = biolist;
3424                         biolist = bio;
3425                         bio->bi_end_io = end_sync_write;
3426                         bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3427                         if (test_bit(FailFast, &rdev->flags))
3428                                 bio->bi_opf |= MD_FAILFAST;
3429                         bio->bi_iter.bi_sector = sector + rdev->data_offset;
3430                         bio_set_dev(bio, rdev->bdev);
3431                         count++;
3432                         rcu_read_unlock();
3433                 }
3434
3435                 if (count < 2) {
3436                         for (i=0; i<conf->copies; i++) {
3437                                 int d = r10_bio->devs[i].devnum;
3438                                 if (r10_bio->devs[i].bio->bi_end_io)
3439                                         rdev_dec_pending(conf->mirrors[d].rdev,
3440                                                          mddev);
3441                                 if (r10_bio->devs[i].repl_bio &&
3442                                     r10_bio->devs[i].repl_bio->bi_end_io)
3443                                         rdev_dec_pending(
3444                                                 conf->mirrors[d].replacement,
3445                                                 mddev);
3446                         }
3447                         put_buf(r10_bio);
3448                         biolist = NULL;
3449                         goto giveup;
3450                 }
3451         }
3452
3453         nr_sectors = 0;
3454         if (sector_nr + max_sync < max_sector)
3455                 max_sector = sector_nr + max_sync;
3456         do {
3457                 struct page *page;
3458                 int len = PAGE_SIZE;
3459                 if (sector_nr + (len>>9) > max_sector)
3460                         len = (max_sector - sector_nr) << 9;
3461                 if (len == 0)
3462                         break;
3463                 for (bio= biolist ; bio ; bio=bio->bi_next) {
3464                         struct resync_pages *rp = get_resync_pages(bio);
3465                         page = resync_fetch_page(rp, page_idx);
3466                         /*
3467                          * won't fail because the vec table is big enough
3468                          * to hold all these pages
3469                          */
3470                         bio_add_page(bio, page, len, 0);
3471                 }
3472                 nr_sectors += len>>9;
3473                 sector_nr += len>>9;
3474         } while (++page_idx < RESYNC_PAGES);
3475         r10_bio->sectors = nr_sectors;
3476
3477         if (mddev_is_clustered(mddev) &&
3478             test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3479                 /* It is resync not recovery */
3480                 if (conf->cluster_sync_high < sector_nr + nr_sectors) {
3481                         conf->cluster_sync_low = mddev->curr_resync_completed;
3482                         raid10_set_cluster_sync_high(conf);
3483                         /* Send resync message */
3484                         md_cluster_ops->resync_info_update(mddev,
3485                                                 conf->cluster_sync_low,
3486                                                 conf->cluster_sync_high);
3487                 }
3488         } else if (mddev_is_clustered(mddev)) {
3489                 /* This is recovery not resync */
3490                 sector_t sect_va1, sect_va2;
3491                 bool broadcast_msg = false;
3492
3493                 for (i = 0; i < conf->geo.raid_disks; i++) {
3494                         /*
3495                          * sector_nr is a device address for recovery, so we
3496                          * need translate it to array address before compare
3497                          * with cluster_sync_high.
3498                          */
3499                         sect_va1 = raid10_find_virt(conf, sector_nr, i);
3500
3501                         if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
3502                                 broadcast_msg = true;
3503                                 /*
3504                                  * curr_resync_completed is similar as
3505                                  * sector_nr, so make the translation too.
3506                                  */
3507                                 sect_va2 = raid10_find_virt(conf,
3508                                         mddev->curr_resync_completed, i);
3509
3510                                 if (conf->cluster_sync_low == 0 ||
3511                                     conf->cluster_sync_low > sect_va2)
3512                                         conf->cluster_sync_low = sect_va2;
3513                         }
3514                 }
3515                 if (broadcast_msg) {
3516                         raid10_set_cluster_sync_high(conf);
3517                         md_cluster_ops->resync_info_update(mddev,
3518                                                 conf->cluster_sync_low,
3519                                                 conf->cluster_sync_high);
3520                 }
3521         }
3522
3523         while (biolist) {
3524                 bio = biolist;
3525                 biolist = biolist->bi_next;
3526
3527                 bio->bi_next = NULL;
3528                 r10_bio = get_resync_r10bio(bio);
3529                 r10_bio->sectors = nr_sectors;
3530
3531                 if (bio->bi_end_io == end_sync_read) {
3532                         md_sync_acct_bio(bio, nr_sectors);
3533                         bio->bi_status = 0;
3534                         generic_make_request(bio);
3535                 }
3536         }
3537
3538         if (sectors_skipped)
3539                 /* pretend they weren't skipped, it makes
3540                  * no important difference in this case
3541                  */
3542                 md_done_sync(mddev, sectors_skipped, 1);
3543
3544         return sectors_skipped + nr_sectors;
3545  giveup:
3546         /* There is nowhere to write, so all non-sync
3547          * drives must be failed or in resync, all drives
3548          * have a bad block, so try the next chunk...
3549          */
3550         if (sector_nr + max_sync < max_sector)
3551                 max_sector = sector_nr + max_sync;
3552
3553         sectors_skipped += (max_sector - sector_nr);
3554         chunks_skipped ++;
3555         sector_nr = max_sector;
3556         goto skipped;
3557 }
3558
3559 static sector_t
3560 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3561 {
3562         sector_t size;
3563         struct r10conf *conf = mddev->private;
3564
3565         if (!raid_disks)
3566                 raid_disks = min(conf->geo.raid_disks,
3567                                  conf->prev.raid_disks);
3568         if (!sectors)
3569                 sectors = conf->dev_sectors;
3570
3571         size = sectors >> conf->geo.chunk_shift;
3572         sector_div(size, conf->geo.far_copies);
3573         size = size * raid_disks;
3574         sector_div(size, conf->geo.near_copies);
3575
3576         return size << conf->geo.chunk_shift;
3577 }
3578
3579 static void calc_sectors(struct r10conf *conf, sector_t size)
3580 {
3581         /* Calculate the number of sectors-per-device that will
3582          * actually be used, and set conf->dev_sectors and
3583          * conf->stride
3584          */
3585
3586         size = size >> conf->geo.chunk_shift;
3587         sector_div(size, conf->geo.far_copies);
3588         size = size * conf->geo.raid_disks;
3589         sector_div(size, conf->geo.near_copies);
3590         /* 'size' is now the number of chunks in the array */
3591         /* calculate "used chunks per device" */
3592         size = size * conf->copies;
3593
3594         /* We need to round up when dividing by raid_disks to
3595          * get the stride size.
3596          */
3597         size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3598
3599         conf->dev_sectors = size << conf->geo.chunk_shift;
3600
3601         if (conf->geo.far_offset)
3602                 conf->geo.stride = 1 << conf->geo.chunk_shift;
3603         else {
3604                 sector_div(size, conf->geo.far_copies);
3605                 conf->geo.stride = size << conf->geo.chunk_shift;
3606         }
3607 }
3608
3609 enum geo_type {geo_new, geo_old, geo_start};
3610 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3611 {
3612         int nc, fc, fo;
3613         int layout, chunk, disks;
3614         switch (new) {
3615         case geo_old:
3616                 layout = mddev->layout;
3617                 chunk = mddev->chunk_sectors;
3618                 disks = mddev->raid_disks - mddev->delta_disks;
3619                 break;
3620         case geo_new:
3621                 layout = mddev->new_layout;
3622                 chunk = mddev->new_chunk_sectors;
3623                 disks = mddev->raid_disks;
3624                 break;
3625         default: /* avoid 'may be unused' warnings */
3626         case geo_start: /* new when starting reshape - raid_disks not
3627                          * updated yet. */
3628                 layout = mddev->new_layout;
3629                 chunk = mddev->new_chunk_sectors;
3630                 disks = mddev->raid_disks + mddev->delta_disks;
3631                 break;
3632         }
3633         if (layout >> 19)
3634                 return -1;
3635         if (chunk < (PAGE_SIZE >> 9) ||
3636             !is_power_of_2(chunk))
3637                 return -2;
3638         nc = layout & 255;
3639         fc = (layout >> 8) & 255;
3640         fo = layout & (1<<16);
3641         geo->raid_disks = disks;
3642         geo->near_copies = nc;
3643         geo->far_copies = fc;
3644         geo->far_offset = fo;
3645         switch (layout >> 17) {
3646         case 0: /* original layout.  simple but not always optimal */
3647                 geo->far_set_size = disks;
3648                 break;
3649         case 1: /* "improved" layout which was buggy.  Hopefully no-one is
3650                  * actually using this, but leave code here just in case.*/
3651                 geo->far_set_size = disks/fc;
3652                 WARN(geo->far_set_size < fc,
3653                      "This RAID10 layout does not provide data safety - please backup and create new array\n");
3654                 break;
3655         case 2: /* "improved" layout fixed to match documentation */
3656                 geo->far_set_size = fc * nc;
3657                 break;
3658         default: /* Not a valid layout */
3659                 return -1;
3660         }
3661         geo->chunk_mask = chunk - 1;
3662         geo->chunk_shift = ffz(~chunk);
3663         return nc*fc;
3664 }
3665
3666 static struct r10conf *setup_conf(struct mddev *mddev)
3667 {
3668         struct r10conf *conf = NULL;
3669         int err = -EINVAL;
3670         struct geom geo;
3671         int copies;
3672
3673         copies = setup_geo(&geo, mddev, geo_new);
3674
3675         if (copies == -2) {
3676                 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3677                         mdname(mddev), PAGE_SIZE);
3678                 goto out;
3679         }
3680
3681         if (copies < 2 || copies > mddev->raid_disks) {
3682                 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3683                         mdname(mddev), mddev->new_layout);
3684                 goto out;
3685         }
3686
3687         err = -ENOMEM;
3688         conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3689         if (!conf)
3690                 goto out;
3691
3692         /* FIXME calc properly */
3693         conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks),
3694                                 sizeof(struct raid10_info),
3695                                 GFP_KERNEL);
3696         if (!conf->mirrors)
3697                 goto out;
3698
3699         conf->tmppage = alloc_page(GFP_KERNEL);
3700         if (!conf->tmppage)
3701                 goto out;
3702
3703         conf->geo = geo;
3704         conf->copies = copies;
3705         err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc,
3706                            r10bio_pool_free, conf);
3707         if (err)
3708                 goto out;
3709
3710         err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
3711         if (err)
3712                 goto out;
3713
3714         calc_sectors(conf, mddev->dev_sectors);
3715         if (mddev->reshape_position == MaxSector) {
3716                 conf->prev = conf->geo;
3717                 conf->reshape_progress = MaxSector;
3718         } else {
3719                 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3720                         err = -EINVAL;
3721                         goto out;
3722                 }
3723                 conf->reshape_progress = mddev->reshape_position;
3724                 if (conf->prev.far_offset)
3725                         conf->prev.stride = 1 << conf->prev.chunk_shift;
3726                 else
3727                         /* far_copies must be 1 */
3728                         conf->prev.stride = conf->dev_sectors;
3729         }
3730         conf->reshape_safe = conf->reshape_progress;
3731         spin_lock_init(&conf->device_lock);
3732         INIT_LIST_HEAD(&conf->retry_list);
3733         INIT_LIST_HEAD(&conf->bio_end_io_list);
3734
3735         spin_lock_init(&conf->resync_lock);
3736         init_waitqueue_head(&conf->wait_barrier);
3737         atomic_set(&conf->nr_pending, 0);
3738
3739         err = -ENOMEM;
3740         conf->thread = md_register_thread(raid10d, mddev, "raid10");
3741         if (!conf->thread)
3742                 goto out;
3743
3744         conf->mddev = mddev;
3745         return conf;
3746
3747  out:
3748         if (conf) {
3749                 mempool_exit(&conf->r10bio_pool);
3750                 kfree(conf->mirrors);
3751                 safe_put_page(conf->tmppage);
3752                 bioset_exit(&conf->bio_split);
3753                 kfree(conf);
3754         }
3755         return ERR_PTR(err);
3756 }
3757
3758 static int raid10_run(struct mddev *mddev)
3759 {
3760         struct r10conf *conf;
3761         int i, disk_idx, chunk_size;
3762         struct raid10_info *disk;
3763         struct md_rdev *rdev;
3764         sector_t size;
3765         sector_t min_offset_diff = 0;
3766         int first = 1;
3767         bool discard_supported = false;
3768
3769         if (mddev_init_writes_pending(mddev) < 0)
3770                 return -ENOMEM;
3771
3772         if (mddev->private == NULL) {
3773                 conf = setup_conf(mddev);
3774                 if (IS_ERR(conf))
3775                         return PTR_ERR(conf);
3776                 mddev->private = conf;
3777         }
3778         conf = mddev->private;
3779         if (!conf)
3780                 goto out;
3781
3782         if (mddev_is_clustered(conf->mddev)) {
3783                 int fc, fo;
3784
3785                 fc = (mddev->layout >> 8) & 255;
3786                 fo = mddev->layout & (1<<16);
3787                 if (fc > 1 || fo > 0) {
3788                         pr_err("only near layout is supported by clustered"
3789                                 " raid10\n");
3790                         goto out_free_conf;
3791                 }
3792         }
3793
3794         mddev->thread = conf->thread;
3795         conf->thread = NULL;
3796
3797         chunk_size = mddev->chunk_sectors << 9;
3798         if (mddev->queue) {
3799                 blk_queue_max_discard_sectors(mddev->queue,
3800                                               mddev->chunk_sectors);
3801                 blk_queue_max_write_same_sectors(mddev->queue, 0);
3802                 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
3803                 blk_queue_io_min(mddev->queue, chunk_size);
3804                 if (conf->geo.raid_disks % conf->geo.near_copies)
3805                         blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3806                 else
3807                         blk_queue_io_opt(mddev->queue, chunk_size *
3808                                          (conf->geo.raid_disks / conf->geo.near_copies));
3809         }
3810
3811         rdev_for_each(rdev, mddev) {
3812                 long long diff;
3813
3814                 disk_idx = rdev->raid_disk;
3815                 if (disk_idx < 0)
3816                         continue;
3817                 if (disk_idx >= conf->geo.raid_disks &&
3818                     disk_idx >= conf->prev.raid_disks)
3819                         continue;
3820                 disk = conf->mirrors + disk_idx;
3821
3822                 if (test_bit(Replacement, &rdev->flags)) {
3823                         if (disk->replacement)
3824                                 goto out_free_conf;
3825                         disk->replacement = rdev;
3826                 } else {
3827                         if (disk->rdev)
3828                                 goto out_free_conf;
3829                         disk->rdev = rdev;
3830                 }
3831                 diff = (rdev->new_data_offset - rdev->data_offset);
3832                 if (!mddev->reshape_backwards)
3833                         diff = -diff;
3834                 if (diff < 0)
3835                         diff = 0;
3836                 if (first || diff < min_offset_diff)
3837                         min_offset_diff = diff;
3838
3839                 if (mddev->gendisk)
3840                         disk_stack_limits(mddev->gendisk, rdev->bdev,
3841                                           rdev->data_offset << 9);
3842
3843                 disk->head_position = 0;
3844
3845                 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3846                         discard_supported = true;
3847                 first = 0;
3848         }
3849
3850         if (mddev->queue) {
3851                 if (discard_supported)
3852                         blk_queue_flag_set(QUEUE_FLAG_DISCARD,
3853                                                 mddev->queue);
3854                 else
3855                         blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
3856                                                   mddev->queue);
3857         }
3858         /* need to check that every block has at least one working mirror */
3859         if (!enough(conf, -1)) {
3860                 pr_err("md/raid10:%s: not enough operational mirrors.\n",
3861                        mdname(mddev));
3862                 goto out_free_conf;
3863         }
3864
3865         if (conf->reshape_progress != MaxSector) {
3866                 /* must ensure that shape change is supported */
3867                 if (conf->geo.far_copies != 1 &&
3868                     conf->geo.far_offset == 0)
3869                         goto out_free_conf;
3870                 if (conf->prev.far_copies != 1 &&
3871                     conf->prev.far_offset == 0)
3872                         goto out_free_conf;
3873         }
3874
3875         mddev->degraded = 0;
3876         for (i = 0;
3877              i < conf->geo.raid_disks
3878                      || i < conf->prev.raid_disks;
3879              i++) {
3880
3881                 disk = conf->mirrors + i;
3882
3883                 if (!disk->rdev && disk->replacement) {
3884                         /* The replacement is all we have - use it */
3885                         disk->rdev = disk->replacement;
3886                         disk->replacement = NULL;
3887                         clear_bit(Replacement, &disk->rdev->flags);
3888                 }
3889
3890                 if (!disk->rdev ||
3891                     !test_bit(In_sync, &disk->rdev->flags)) {
3892                         disk->head_position = 0;
3893                         mddev->degraded++;
3894                         if (disk->rdev &&
3895                             disk->rdev->saved_raid_disk < 0)
3896                                 conf->fullsync = 1;
3897                 }
3898
3899                 if (disk->replacement &&
3900                     !test_bit(In_sync, &disk->replacement->flags) &&
3901                     disk->replacement->saved_raid_disk < 0) {
3902                         conf->fullsync = 1;
3903                 }
3904
3905                 disk->recovery_disabled = mddev->recovery_disabled - 1;
3906         }
3907
3908         if (mddev->recovery_cp != MaxSector)
3909                 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
3910                           mdname(mddev));
3911         pr_info("md/raid10:%s: active with %d out of %d devices\n",
3912                 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3913                 conf->geo.raid_disks);
3914         /*
3915          * Ok, everything is just fine now
3916          */
3917         mddev->dev_sectors = conf->dev_sectors;
3918         size = raid10_size(mddev, 0, 0);
3919         md_set_array_sectors(mddev, size);
3920         mddev->resync_max_sectors = size;
3921         set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
3922
3923         if (mddev->queue) {
3924                 int stripe = conf->geo.raid_disks *
3925                         ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3926
3927                 /* Calculate max read-ahead size.
3928                  * We need to readahead at least twice a whole stripe....
3929                  * maybe...
3930                  */
3931                 stripe /= conf->geo.near_copies;
3932                 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
3933                         mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
3934         }
3935
3936         if (md_integrity_register(mddev))
3937                 goto out_free_conf;
3938
3939         if (conf->reshape_progress != MaxSector) {
3940                 unsigned long before_length, after_length;
3941
3942                 before_length = ((1 << conf->prev.chunk_shift) *
3943                                  conf->prev.far_copies);
3944                 after_length = ((1 << conf->geo.chunk_shift) *
3945                                 conf->geo.far_copies);
3946
3947                 if (max(before_length, after_length) > min_offset_diff) {
3948                         /* This cannot work */
3949                         pr_warn("md/raid10: offset difference not enough to continue reshape\n");
3950                         goto out_free_conf;
3951                 }
3952                 conf->offset_diff = min_offset_diff;
3953
3954                 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3955                 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3956                 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3957                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3958                 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3959                                                         "reshape");
3960         }
3961
3962         return 0;
3963
3964 out_free_conf:
3965         md_unregister_thread(&mddev->thread);
3966         mempool_exit(&conf->r10bio_pool);
3967         safe_put_page(conf->tmppage);
3968         kfree(conf->mirrors);
3969         kfree(conf);
3970         mddev->private = NULL;
3971 out:
3972         return -EIO;
3973 }
3974
3975 static void raid10_free(struct mddev *mddev, void *priv)
3976 {
3977         struct r10conf *conf = priv;
3978
3979         mempool_exit(&conf->r10bio_pool);
3980         safe_put_page(conf->tmppage);
3981         kfree(conf->mirrors);
3982         kfree(conf->mirrors_old);
3983         kfree(conf->mirrors_new);
3984         bioset_exit(&conf->bio_split);
3985         kfree(conf);
3986 }
3987
3988 static void raid10_quiesce(struct mddev *mddev, int quiesce)
3989 {
3990         struct r10conf *conf = mddev->private;
3991
3992         if (quiesce)
3993                 raise_barrier(conf, 0);
3994         else
3995                 lower_barrier(conf);
3996 }
3997
3998 static int raid10_resize(struct mddev *mddev, sector_t sectors)
3999 {
4000         /* Resize of 'far' arrays is not supported.
4001          * For 'near' and 'offset' arrays we can set the
4002          * number of sectors used to be an appropriate multiple
4003          * of the chunk size.
4004          * For 'offset', this is far_copies*chunksize.
4005          * For 'near' the multiplier is the LCM of
4006          * near_copies and raid_disks.
4007          * So if far_copies > 1 && !far_offset, fail.
4008          * Else find LCM(raid_disks, near_copy)*far_copies and
4009          * multiply by chunk_size.  Then round to this number.
4010          * This is mostly done by raid10_size()
4011          */
4012         struct r10conf *conf = mddev->private;
4013         sector_t oldsize, size;
4014
4015         if (mddev->reshape_position != MaxSector)
4016                 return -EBUSY;
4017
4018         if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
4019                 return -EINVAL;
4020
4021         oldsize = raid10_size(mddev, 0, 0);
4022         size = raid10_size(mddev, sectors, 0);
4023         if (mddev->external_size &&
4024             mddev->array_sectors > size)
4025                 return -EINVAL;
4026         if (mddev->bitmap) {
4027                 int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
4028                 if (ret)
4029                         return ret;
4030         }
4031         md_set_array_sectors(mddev, size);
4032         if (sectors > mddev->dev_sectors &&
4033             mddev->recovery_cp > oldsize) {
4034                 mddev->recovery_cp = oldsize;
4035                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4036         }
4037         calc_sectors(conf, sectors);
4038         mddev->dev_sectors = conf->dev_sectors;
4039         mddev->resync_max_sectors = size;
4040         return 0;
4041 }
4042
4043 static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
4044 {
4045         struct md_rdev *rdev;
4046         struct r10conf *conf;
4047
4048         if (mddev->degraded > 0) {
4049                 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
4050                         mdname(mddev));
4051                 return ERR_PTR(-EINVAL);
4052         }
4053         sector_div(size, devs);
4054
4055         /* Set new parameters */
4056         mddev->new_level = 10;
4057         /* new layout: far_copies = 1, near_copies = 2 */
4058         mddev->new_layout = (1<<8) + 2;
4059         mddev->new_chunk_sectors = mddev->chunk_sectors;
4060         mddev->delta_disks = mddev->raid_disks;
4061         mddev->raid_disks *= 2;
4062         /* make sure it will be not marked as dirty */
4063         mddev->recovery_cp = MaxSector;
4064         mddev->dev_sectors = size;
4065
4066         conf = setup_conf(mddev);
4067         if (!IS_ERR(conf)) {
4068                 rdev_for_each(rdev, mddev)
4069                         if (rdev->raid_disk >= 0) {
4070                                 rdev->new_raid_disk = rdev->raid_disk * 2;
4071                                 rdev->sectors = size;
4072                         }
4073                 conf->barrier = 1;
4074         }
4075
4076         return conf;
4077 }
4078
4079 static void *raid10_takeover(struct mddev *mddev)
4080 {
4081         struct r0conf *raid0_conf;
4082
4083         /* raid10 can take over:
4084          *  raid0 - providing it has only two drives
4085          */
4086         if (mddev->level == 0) {
4087                 /* for raid0 takeover only one zone is supported */
4088                 raid0_conf = mddev->private;
4089                 if (raid0_conf->nr_strip_zones > 1) {
4090                         pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4091                                 mdname(mddev));
4092                         return ERR_PTR(-EINVAL);
4093                 }
4094                 return raid10_takeover_raid0(mddev,
4095                         raid0_conf->strip_zone->zone_end,
4096                         raid0_conf->strip_zone->nb_dev);
4097         }
4098         return ERR_PTR(-EINVAL);
4099 }
4100
4101 static int raid10_check_reshape(struct mddev *mddev)
4102 {
4103         /* Called when there is a request to change
4104          * - layout (to ->new_layout)
4105          * - chunk size (to ->new_chunk_sectors)
4106          * - raid_disks (by delta_disks)
4107          * or when trying to restart a reshape that was ongoing.
4108          *
4109          * We need to validate the request and possibly allocate
4110          * space if that might be an issue later.
4111          *
4112          * Currently we reject any reshape of a 'far' mode array,
4113          * allow chunk size to change if new is generally acceptable,
4114          * allow raid_disks to increase, and allow
4115          * a switch between 'near' mode and 'offset' mode.
4116          */
4117         struct r10conf *conf = mddev->private;
4118         struct geom geo;
4119
4120         if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4121                 return -EINVAL;
4122
4123         if (setup_geo(&geo, mddev, geo_start) != conf->copies)
4124                 /* mustn't change number of copies */
4125                 return -EINVAL;
4126         if (geo.far_copies > 1 && !geo.far_offset)
4127                 /* Cannot switch to 'far' mode */
4128                 return -EINVAL;
4129
4130         if (mddev->array_sectors & geo.chunk_mask)
4131                         /* not factor of array size */
4132                         return -EINVAL;
4133
4134         if (!enough(conf, -1))
4135                 return -EINVAL;
4136
4137         kfree(conf->mirrors_new);
4138         conf->mirrors_new = NULL;
4139         if (mddev->delta_disks > 0) {
4140                 /* allocate new 'mirrors' list */
4141                 conf->mirrors_new =
4142                         kcalloc(mddev->raid_disks + mddev->delta_disks,
4143                                 sizeof(struct raid10_info),
4144                                 GFP_KERNEL);
4145                 if (!conf->mirrors_new)
4146                         return -ENOMEM;
4147         }
4148         return 0;
4149 }
4150
4151 /*
4152  * Need to check if array has failed when deciding whether to:
4153  *  - start an array
4154  *  - remove non-faulty devices
4155  *  - add a spare
4156  *  - allow a reshape
4157  * This determination is simple when no reshape is happening.
4158  * However if there is a reshape, we need to carefully check
4159  * both the before and after sections.
4160  * This is because some failed devices may only affect one
4161  * of the two sections, and some non-in_sync devices may
4162  * be insync in the section most affected by failed devices.
4163  */
4164 static int calc_degraded(struct r10conf *conf)
4165 {
4166         int degraded, degraded2;
4167         int i;
4168
4169         rcu_read_lock();
4170         degraded = 0;
4171         /* 'prev' section first */
4172         for (i = 0; i < conf->prev.raid_disks; i++) {
4173                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4174                 if (!rdev || test_bit(Faulty, &rdev->flags))
4175                         degraded++;
4176                 else if (!test_bit(In_sync, &rdev->flags))
4177                         /* When we can reduce the number of devices in
4178                          * an array, this might not contribute to
4179                          * 'degraded'.  It does now.
4180                          */
4181                         degraded++;
4182         }
4183         rcu_read_unlock();
4184         if (conf->geo.raid_disks == conf->prev.raid_disks)
4185                 return degraded;
4186         rcu_read_lock();
4187         degraded2 = 0;
4188         for (i = 0; i < conf->geo.raid_disks; i++) {
4189                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4190                 if (!rdev || test_bit(Faulty, &rdev->flags))
4191                         degraded2++;
4192                 else if (!test_bit(In_sync, &rdev->flags)) {
4193                         /* If reshape is increasing the number of devices,
4194                          * this section has already been recovered, so
4195                          * it doesn't contribute to degraded.
4196                          * else it does.
4197                          */
4198                         if (conf->geo.raid_disks <= conf->prev.raid_disks)
4199                                 degraded2++;
4200                 }
4201         }
4202         rcu_read_unlock();
4203         if (degraded2 > degraded)
4204                 return degraded2;
4205         return degraded;
4206 }
4207
4208 static int raid10_start_reshape(struct mddev *mddev)
4209 {
4210         /* A 'reshape' has been requested. This commits
4211          * the various 'new' fields and sets MD_RECOVER_RESHAPE
4212          * This also checks if there are enough spares and adds them
4213          * to the array.
4214          * We currently require enough spares to make the final
4215          * array non-degraded.  We also require that the difference
4216          * between old and new data_offset - on each device - is
4217          * enough that we never risk over-writing.
4218          */
4219
4220         unsigned long before_length, after_length;
4221         sector_t min_offset_diff = 0;
4222         int first = 1;
4223         struct geom new;
4224         struct r10conf *conf = mddev->private;
4225         struct md_rdev *rdev;
4226         int spares = 0;
4227         int ret;
4228
4229         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4230                 return -EBUSY;
4231
4232         if (setup_geo(&new, mddev, geo_start) != conf->copies)
4233                 return -EINVAL;
4234
4235         before_length = ((1 << conf->prev.chunk_shift) *
4236                          conf->prev.far_copies);
4237         after_length = ((1 << conf->geo.chunk_shift) *
4238                         conf->geo.far_copies);
4239
4240         rdev_for_each(rdev, mddev) {
4241                 if (!test_bit(In_sync, &rdev->flags)
4242                     && !test_bit(Faulty, &rdev->flags))
4243                         spares++;
4244                 if (rdev->raid_disk >= 0) {
4245                         long long diff = (rdev->new_data_offset
4246                                           - rdev->data_offset);
4247                         if (!mddev->reshape_backwards)
4248                                 diff = -diff;
4249                         if (diff < 0)
4250                                 diff = 0;
4251                         if (first || diff < min_offset_diff)
4252                                 min_offset_diff = diff;
4253                         first = 0;
4254                 }
4255         }
4256
4257         if (max(before_length, after_length) > min_offset_diff)
4258                 return -EINVAL;
4259
4260         if (spares < mddev->delta_disks)
4261                 return -EINVAL;
4262
4263         conf->offset_diff = min_offset_diff;
4264         spin_lock_irq(&conf->device_lock);
4265         if (conf->mirrors_new) {
4266                 memcpy(conf->mirrors_new, conf->mirrors,
4267                        sizeof(struct raid10_info)*conf->prev.raid_disks);
4268                 smp_mb();
4269                 kfree(conf->mirrors_old);
4270                 conf->mirrors_old = conf->mirrors;
4271                 conf->mirrors = conf->mirrors_new;
4272                 conf->mirrors_new = NULL;
4273         }
4274         setup_geo(&conf->geo, mddev, geo_start);
4275         smp_mb();
4276         if (mddev->reshape_backwards) {
4277                 sector_t size = raid10_size(mddev, 0, 0);
4278                 if (size < mddev->array_sectors) {
4279                         spin_unlock_irq(&conf->device_lock);
4280                         pr_warn("md/raid10:%s: array size must be reduce before number of disks\n",
4281                                 mdname(mddev));
4282                         return -EINVAL;
4283                 }
4284                 mddev->resync_max_sectors = size;
4285                 conf->reshape_progress = size;
4286         } else
4287                 conf->reshape_progress = 0;
4288         conf->reshape_safe = conf->reshape_progress;
4289         spin_unlock_irq(&conf->device_lock);
4290
4291         if (mddev->delta_disks && mddev->bitmap) {
4292                 struct mdp_superblock_1 *sb = NULL;
4293                 sector_t oldsize, newsize;
4294
4295                 oldsize = raid10_size(mddev, 0, 0);
4296                 newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
4297
4298                 if (!mddev_is_clustered(mddev)) {
4299                         ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4300                         if (ret)
4301                                 goto abort;
4302                         else
4303                                 goto out;
4304                 }
4305
4306                 rdev_for_each(rdev, mddev) {
4307                         if (rdev->raid_disk > -1 &&
4308                             !test_bit(Faulty, &rdev->flags))
4309                                 sb = page_address(rdev->sb_page);
4310                 }
4311
4312                 /*
4313                  * some node is already performing reshape, and no need to
4314                  * call md_bitmap_resize again since it should be called when
4315                  * receiving BITMAP_RESIZE msg
4316                  */
4317                 if ((sb && (le32_to_cpu(sb->feature_map) &
4318                             MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
4319                         goto out;
4320
4321                 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4322                 if (ret)
4323                         goto abort;
4324
4325                 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
4326                 if (ret) {
4327                         md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
4328                         goto abort;
4329                 }
4330         }
4331 out:
4332         if (mddev->delta_disks > 0) {
4333                 rdev_for_each(rdev, mddev)
4334                         if (rdev->raid_disk < 0 &&
4335                             !test_bit(Faulty, &rdev->flags)) {
4336                                 if (raid10_add_disk(mddev, rdev) == 0) {
4337                                         if (rdev->raid_disk >=
4338                                             conf->prev.raid_disks)
4339                                                 set_bit(In_sync, &rdev->flags);
4340                                         else
4341                                                 rdev->recovery_offset = 0;
4342
4343                                         if (sysfs_link_rdev(mddev, rdev))
4344                                                 /* Failure here  is OK */;
4345                                 }
4346                         } else if (rdev->raid_disk >= conf->prev.raid_disks
4347                                    && !test_bit(Faulty, &rdev->flags)) {
4348                                 /* This is a spare that was manually added */
4349                                 set_bit(In_sync, &rdev->flags);
4350                         }
4351         }
4352         /* When a reshape changes the number of devices,
4353          * ->degraded is measured against the larger of the
4354          * pre and  post numbers.
4355          */
4356         spin_lock_irq(&conf->device_lock);
4357         mddev->degraded = calc_degraded(conf);
4358         spin_unlock_irq(&conf->device_lock);
4359         mddev->raid_disks = conf->geo.raid_disks;
4360         mddev->reshape_position = conf->reshape_progress;
4361         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4362
4363         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4364         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4365         clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4366         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4367         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4368
4369         mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4370                                                 "reshape");
4371         if (!mddev->sync_thread) {
4372                 ret = -EAGAIN;
4373                 goto abort;
4374         }
4375         conf->reshape_checkpoint = jiffies;
4376         md_wakeup_thread(mddev->sync_thread);
4377         md_new_event(mddev);
4378         return 0;
4379
4380 abort:
4381         mddev->recovery = 0;
4382         spin_lock_irq(&conf->device_lock);
4383         conf->geo = conf->prev;
4384         mddev->raid_disks = conf->geo.raid_disks;
4385         rdev_for_each(rdev, mddev)
4386                 rdev->new_data_offset = rdev->data_offset;
4387         smp_wmb();
4388         conf->reshape_progress = MaxSector;
4389         conf->reshape_safe = MaxSector;
4390         mddev->reshape_position = MaxSector;
4391         spin_unlock_irq(&conf->device_lock);
4392         return ret;
4393 }
4394
4395 /* Calculate the last device-address that could contain
4396  * any block from the chunk that includes the array-address 's'
4397  * and report the next address.
4398  * i.e. the address returned will be chunk-aligned and after
4399  * any data that is in the chunk containing 's'.
4400  */
4401 static sector_t last_dev_address(sector_t s, struct geom *geo)
4402 {
4403         s = (s | geo->chunk_mask) + 1;
4404         s >>= geo->chunk_shift;
4405         s *= geo->near_copies;
4406         s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4407         s *= geo->far_copies;
4408         s <<= geo->chunk_shift;
4409         return s;
4410 }
4411
4412 /* Calculate the first device-address that could contain
4413  * any block from the chunk that includes the array-address 's'.
4414  * This too will be the start of a chunk
4415  */
4416 static sector_t first_dev_address(sector_t s, struct geom *geo)
4417 {
4418         s >>= geo->chunk_shift;
4419         s *= geo->near_copies;
4420         sector_div(s, geo->raid_disks);
4421         s *= geo->far_copies;
4422         s <<= geo->chunk_shift;
4423         return s;
4424 }
4425
4426 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4427                                 int *skipped)
4428 {
4429         /* We simply copy at most one chunk (smallest of old and new)
4430          * at a time, possibly less if that exceeds RESYNC_PAGES,
4431          * or we hit a bad block or something.
4432          * This might mean we pause for normal IO in the middle of
4433          * a chunk, but that is not a problem as mddev->reshape_position
4434          * can record any location.
4435          *
4436          * If we will want to write to a location that isn't
4437          * yet recorded as 'safe' (i.e. in metadata on disk) then
4438          * we need to flush all reshape requests and update the metadata.
4439          *
4440          * When reshaping forwards (e.g. to more devices), we interpret
4441          * 'safe' as the earliest block which might not have been copied
4442          * down yet.  We divide this by previous stripe size and multiply
4443          * by previous stripe length to get lowest device offset that we
4444          * cannot write to yet.
4445          * We interpret 'sector_nr' as an address that we want to write to.
4446          * From this we use last_device_address() to find where we might
4447          * write to, and first_device_address on the  'safe' position.
4448          * If this 'next' write position is after the 'safe' position,
4449          * we must update the metadata to increase the 'safe' position.
4450          *
4451          * When reshaping backwards, we round in the opposite direction
4452          * and perform the reverse test:  next write position must not be
4453          * less than current safe position.
4454          *
4455          * In all this the minimum difference in data offsets
4456          * (conf->offset_diff - always positive) allows a bit of slack,
4457          * so next can be after 'safe', but not by more than offset_diff
4458          *
4459          * We need to prepare all the bios here before we start any IO
4460          * to ensure the size we choose is acceptable to all devices.
4461          * The means one for each copy for write-out and an extra one for
4462          * read-in.
4463          * We store the read-in bio in ->master_bio and the others in
4464          * ->devs[x].bio and ->devs[x].repl_bio.
4465          */
4466         struct r10conf *conf = mddev->private;
4467         struct r10bio *r10_bio;
4468         sector_t next, safe, last;
4469         int max_sectors;
4470         int nr_sectors;
4471         int s;
4472         struct md_rdev *rdev;
4473         int need_flush = 0;
4474         struct bio *blist;
4475         struct bio *bio, *read_bio;
4476         int sectors_done = 0;
4477         struct page **pages;
4478
4479         if (sector_nr == 0) {
4480                 /* If restarting in the middle, skip the initial sectors */
4481                 if (mddev->reshape_backwards &&
4482                     conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4483                         sector_nr = (raid10_size(mddev, 0, 0)
4484                                      - conf->reshape_progress);
4485                 } else if (!mddev->reshape_backwards &&
4486                            conf->reshape_progress > 0)
4487                         sector_nr = conf->reshape_progress;
4488                 if (sector_nr) {
4489                         mddev->curr_resync_completed = sector_nr;
4490                         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4491                         *skipped = 1;
4492                         return sector_nr;
4493                 }
4494         }
4495
4496         /* We don't use sector_nr to track where we are up to
4497          * as that doesn't work well for ->reshape_backwards.
4498          * So just use ->reshape_progress.
4499          */
4500         if (mddev->reshape_backwards) {
4501                 /* 'next' is the earliest device address that we might
4502                  * write to for this chunk in the new layout
4503                  */
4504                 next = first_dev_address(conf->reshape_progress - 1,
4505                                          &conf->geo);
4506
4507                 /* 'safe' is the last device address that we might read from
4508                  * in the old layout after a restart
4509                  */
4510                 safe = last_dev_address(conf->reshape_safe - 1,
4511                                         &conf->prev);
4512
4513                 if (next + conf->offset_diff < safe)
4514                         need_flush = 1;
4515
4516                 last = conf->reshape_progress - 1;
4517                 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4518                                                & conf->prev.chunk_mask);
4519                 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4520                         sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4521         } else {
4522                 /* 'next' is after the last device address that we
4523                  * might write to for this chunk in the new layout
4524                  */
4525                 next = last_dev_address(conf->reshape_progress, &conf->geo);
4526
4527                 /* 'safe' is the earliest device address that we might
4528                  * read from in the old layout after a restart
4529                  */
4530                 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4531
4532                 /* Need to update metadata if 'next' might be beyond 'safe'
4533                  * as that would possibly corrupt data
4534                  */
4535                 if (next > safe + conf->offset_diff)
4536                         need_flush = 1;
4537
4538                 sector_nr = conf->reshape_progress;
4539                 last  = sector_nr | (conf->geo.chunk_mask
4540                                      & conf->prev.chunk_mask);
4541
4542                 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4543                         last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4544         }
4545
4546         if (need_flush ||
4547             time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4548                 /* Need to update reshape_position in metadata */
4549                 wait_barrier(conf);
4550                 mddev->reshape_position = conf->reshape_progress;
4551                 if (mddev->reshape_backwards)
4552                         mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4553                                 - conf->reshape_progress;
4554                 else
4555                         mddev->curr_resync_completed = conf->reshape_progress;
4556                 conf->reshape_checkpoint = jiffies;
4557                 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4558                 md_wakeup_thread(mddev->thread);
4559                 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
4560                            test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4561                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4562                         allow_barrier(conf);
4563                         return sectors_done;
4564                 }
4565                 conf->reshape_safe = mddev->reshape_position;
4566                 allow_barrier(conf);
4567         }
4568
4569         raise_barrier(conf, 0);
4570 read_more:
4571         /* Now schedule reads for blocks from sector_nr to last */
4572         r10_bio = raid10_alloc_init_r10buf(conf);
4573         r10_bio->state = 0;
4574         raise_barrier(conf, 1);
4575         atomic_set(&r10_bio->remaining, 0);
4576         r10_bio->mddev = mddev;
4577         r10_bio->sector = sector_nr;
4578         set_bit(R10BIO_IsReshape, &r10_bio->state);
4579         r10_bio->sectors = last - sector_nr + 1;
4580         rdev = read_balance(conf, r10_bio, &max_sectors);
4581         BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4582
4583         if (!rdev) {
4584                 /* Cannot read from here, so need to record bad blocks
4585                  * on all the target devices.
4586                  */
4587                 // FIXME
4588                 mempool_free(r10_bio, &conf->r10buf_pool);
4589                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4590                 return sectors_done;
4591         }
4592
4593         read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4594
4595         bio_set_dev(read_bio, rdev->bdev);
4596         read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4597                                + rdev->data_offset);
4598         read_bio->bi_private = r10_bio;
4599         read_bio->bi_end_io = end_reshape_read;
4600         bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
4601         read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4602         read_bio->bi_status = 0;
4603         read_bio->bi_vcnt = 0;
4604         read_bio->bi_iter.bi_size = 0;
4605         r10_bio->master_bio = read_bio;
4606         r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4607
4608         /*
4609          * Broadcast RESYNC message to other nodes, so all nodes would not
4610          * write to the region to avoid conflict.
4611         */
4612         if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
4613                 struct mdp_superblock_1 *sb = NULL;
4614                 int sb_reshape_pos = 0;
4615
4616                 conf->cluster_sync_low = sector_nr;
4617                 conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
4618                 sb = page_address(rdev->sb_page);
4619                 if (sb) {
4620                         sb_reshape_pos = le64_to_cpu(sb->reshape_position);
4621                         /*
4622                          * Set cluster_sync_low again if next address for array
4623                          * reshape is less than cluster_sync_low. Since we can't
4624                          * update cluster_sync_low until it has finished reshape.
4625                          */
4626                         if (sb_reshape_pos < conf->cluster_sync_low)
4627                                 conf->cluster_sync_low = sb_reshape_pos;
4628                 }
4629
4630                 md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
4631                                                           conf->cluster_sync_high);
4632         }
4633
4634         /* Now find the locations in the new layout */
4635         __raid10_find_phys(&conf->geo, r10_bio);
4636
4637         blist = read_bio;
4638         read_bio->bi_next = NULL;
4639
4640         rcu_read_lock();
4641         for (s = 0; s < conf->copies*2; s++) {
4642                 struct bio *b;
4643                 int d = r10_bio->devs[s/2].devnum;
4644                 struct md_rdev *rdev2;
4645                 if (s&1) {
4646                         rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4647                         b = r10_bio->devs[s/2].repl_bio;
4648                 } else {
4649                         rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4650                         b = r10_bio->devs[s/2].bio;
4651                 }
4652                 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4653                         continue;
4654
4655                 bio_set_dev(b, rdev2->bdev);
4656                 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4657                         rdev2->new_data_offset;
4658                 b->bi_end_io = end_reshape_write;
4659                 bio_set_op_attrs(b, REQ_OP_WRITE, 0);
4660                 b->bi_next = blist;
4661                 blist = b;
4662         }
4663
4664         /* Now add as many pages as possible to all of these bios. */
4665
4666         nr_sectors = 0;
4667         pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4668         for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4669                 struct page *page = pages[s / (PAGE_SIZE >> 9)];
4670                 int len = (max_sectors - s) << 9;
4671                 if (len > PAGE_SIZE)
4672                         len = PAGE_SIZE;
4673                 for (bio = blist; bio ; bio = bio->bi_next) {
4674                         /*
4675                          * won't fail because the vec table is big enough
4676                          * to hold all these pages
4677                          */
4678                         bio_add_page(bio, page, len, 0);
4679                 }
4680                 sector_nr += len >> 9;
4681                 nr_sectors += len >> 9;
4682         }
4683         rcu_read_unlock();
4684         r10_bio->sectors = nr_sectors;
4685
4686         /* Now submit the read */
4687         md_sync_acct_bio(read_bio, r10_bio->sectors);
4688         atomic_inc(&r10_bio->remaining);
4689         read_bio->bi_next = NULL;
4690         generic_make_request(read_bio);
4691         sector_nr += nr_sectors;
4692         sectors_done += nr_sectors;
4693         if (sector_nr <= last)
4694                 goto read_more;
4695
4696         lower_barrier(conf);
4697
4698         /* Now that we have done the whole section we can
4699          * update reshape_progress
4700          */
4701         if (mddev->reshape_backwards)
4702                 conf->reshape_progress -= sectors_done;
4703         else
4704                 conf->reshape_progress += sectors_done;
4705
4706         return sectors_done;
4707 }
4708
4709 static void end_reshape_request(struct r10bio *r10_bio);
4710 static int handle_reshape_read_error(struct mddev *mddev,
4711                                      struct r10bio *r10_bio);
4712 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4713 {
4714         /* Reshape read completed.  Hopefully we have a block
4715          * to write out.
4716          * If we got a read error then we do sync 1-page reads from
4717          * elsewhere until we find the data - or give up.
4718          */
4719         struct r10conf *conf = mddev->private;
4720         int s;
4721
4722         if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4723                 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4724                         /* Reshape has been aborted */
4725                         md_done_sync(mddev, r10_bio->sectors, 0);
4726                         return;
4727                 }
4728
4729         /* We definitely have the data in the pages, schedule the
4730          * writes.
4731          */
4732         atomic_set(&r10_bio->remaining, 1);
4733         for (s = 0; s < conf->copies*2; s++) {
4734                 struct bio *b;
4735                 int d = r10_bio->devs[s/2].devnum;
4736                 struct md_rdev *rdev;
4737                 rcu_read_lock();
4738                 if (s&1) {
4739                         rdev = rcu_dereference(conf->mirrors[d].replacement);
4740                         b = r10_bio->devs[s/2].repl_bio;
4741                 } else {
4742                         rdev = rcu_dereference(conf->mirrors[d].rdev);
4743                         b = r10_bio->devs[s/2].bio;
4744                 }
4745                 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4746                         rcu_read_unlock();
4747                         continue;
4748                 }
4749                 atomic_inc(&rdev->nr_pending);
4750                 rcu_read_unlock();
4751                 md_sync_acct_bio(b, r10_bio->sectors);
4752                 atomic_inc(&r10_bio->remaining);
4753                 b->bi_next = NULL;
4754                 generic_make_request(b);
4755         }
4756         end_reshape_request(r10_bio);
4757 }
4758
4759 static void end_reshape(struct r10conf *conf)
4760 {
4761         if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4762                 return;
4763
4764         spin_lock_irq(&conf->device_lock);
4765         conf->prev = conf->geo;
4766         md_finish_reshape(conf->mddev);
4767         smp_wmb();
4768         conf->reshape_progress = MaxSector;
4769         conf->reshape_safe = MaxSector;
4770         spin_unlock_irq(&conf->device_lock);
4771
4772         /* read-ahead size must cover two whole stripes, which is
4773          * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
4774          */
4775         if (conf->mddev->queue) {
4776                 int stripe = conf->geo.raid_disks *
4777                         ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4778                 stripe /= conf->geo.near_copies;
4779                 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
4780                         conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
4781         }
4782         conf->fullsync = 0;
4783 }
4784
4785 static void raid10_update_reshape_pos(struct mddev *mddev)
4786 {
4787         struct r10conf *conf = mddev->private;
4788         sector_t lo, hi;
4789
4790         md_cluster_ops->resync_info_get(mddev, &lo, &hi);
4791         if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
4792             || mddev->reshape_position == MaxSector)
4793                 conf->reshape_progress = mddev->reshape_position;
4794         else
4795                 WARN_ON_ONCE(1);
4796 }
4797
4798 static int handle_reshape_read_error(struct mddev *mddev,
4799                                      struct r10bio *r10_bio)
4800 {
4801         /* Use sync reads to get the blocks from somewhere else */
4802         int sectors = r10_bio->sectors;
4803         struct r10conf *conf = mddev->private;
4804         struct r10bio *r10b;
4805         int slot = 0;
4806         int idx = 0;
4807         struct page **pages;
4808
4809         r10b = kmalloc(sizeof(*r10b) +
4810                sizeof(struct r10dev) * conf->copies, GFP_NOIO);
4811         if (!r10b) {
4812                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4813                 return -ENOMEM;
4814         }
4815
4816         /* reshape IOs share pages from .devs[0].bio */
4817         pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4818
4819         r10b->sector = r10_bio->sector;
4820         __raid10_find_phys(&conf->prev, r10b);
4821
4822         while (sectors) {
4823                 int s = sectors;
4824                 int success = 0;
4825                 int first_slot = slot;
4826
4827                 if (s > (PAGE_SIZE >> 9))
4828                         s = PAGE_SIZE >> 9;
4829
4830                 rcu_read_lock();
4831                 while (!success) {
4832                         int d = r10b->devs[slot].devnum;
4833                         struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4834                         sector_t addr;
4835                         if (rdev == NULL ||
4836                             test_bit(Faulty, &rdev->flags) ||
4837                             !test_bit(In_sync, &rdev->flags))
4838                                 goto failed;
4839
4840                         addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4841                         atomic_inc(&rdev->nr_pending);
4842                         rcu_read_unlock();
4843                         success = sync_page_io(rdev,
4844                                                addr,
4845                                                s << 9,
4846                                                pages[idx],
4847                                                REQ_OP_READ, 0, false);
4848                         rdev_dec_pending(rdev, mddev);
4849                         rcu_read_lock();
4850                         if (success)
4851                                 break;
4852                 failed:
4853                         slot++;
4854                         if (slot >= conf->copies)
4855                                 slot = 0;
4856                         if (slot == first_slot)
4857                                 break;
4858                 }
4859                 rcu_read_unlock();
4860                 if (!success) {
4861                         /* couldn't read this block, must give up */
4862                         set_bit(MD_RECOVERY_INTR,
4863                                 &mddev->recovery);
4864                         kfree(r10b);
4865                         return -EIO;
4866                 }
4867                 sectors -= s;
4868                 idx++;
4869         }
4870         kfree(r10b);
4871         return 0;
4872 }
4873
4874 static void end_reshape_write(struct bio *bio)
4875 {
4876         struct r10bio *r10_bio = get_resync_r10bio(bio);
4877         struct mddev *mddev = r10_bio->mddev;
4878         struct r10conf *conf = mddev->private;
4879         int d;
4880         int slot;
4881         int repl;
4882         struct md_rdev *rdev = NULL;
4883
4884         d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4885         if (repl)
4886                 rdev = conf->mirrors[d].replacement;
4887         if (!rdev) {
4888                 smp_mb();
4889                 rdev = conf->mirrors[d].rdev;
4890         }
4891
4892         if (bio->bi_status) {
4893                 /* FIXME should record badblock */
4894                 md_error(mddev, rdev);
4895         }
4896
4897         rdev_dec_pending(rdev, mddev);
4898         end_reshape_request(r10_bio);
4899 }
4900
4901 static void end_reshape_request(struct r10bio *r10_bio)
4902 {
4903         if (!atomic_dec_and_test(&r10_bio->remaining))
4904                 return;
4905         md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4906         bio_put(r10_bio->master_bio);
4907         put_buf(r10_bio);
4908 }
4909
4910 static void raid10_finish_reshape(struct mddev *mddev)
4911 {
4912         struct r10conf *conf = mddev->private;
4913
4914         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4915                 return;
4916
4917         if (mddev->delta_disks > 0) {
4918                 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4919                         mddev->recovery_cp = mddev->resync_max_sectors;
4920                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4921                 }
4922                 mddev->resync_max_sectors = mddev->array_sectors;
4923         } else {
4924                 int d;
4925                 rcu_read_lock();
4926                 for (d = conf->geo.raid_disks ;
4927                      d < conf->geo.raid_disks - mddev->delta_disks;
4928                      d++) {
4929                         struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4930                         if (rdev)
4931                                 clear_bit(In_sync, &rdev->flags);
4932                         rdev = rcu_dereference(conf->mirrors[d].replacement);
4933                         if (rdev)
4934                                 clear_bit(In_sync, &rdev->flags);
4935                 }
4936                 rcu_read_unlock();
4937         }
4938         mddev->layout = mddev->new_layout;
4939         mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4940         mddev->reshape_position = MaxSector;
4941         mddev->delta_disks = 0;
4942         mddev->reshape_backwards = 0;
4943 }
4944
4945 static struct md_personality raid10_personality =
4946 {
4947         .name           = "raid10",
4948         .level          = 10,
4949         .owner          = THIS_MODULE,
4950         .make_request   = raid10_make_request,
4951         .run            = raid10_run,
4952         .free           = raid10_free,
4953         .status         = raid10_status,
4954         .error_handler  = raid10_error,
4955         .hot_add_disk   = raid10_add_disk,
4956         .hot_remove_disk= raid10_remove_disk,
4957         .spare_active   = raid10_spare_active,
4958         .sync_request   = raid10_sync_request,
4959         .quiesce        = raid10_quiesce,
4960         .size           = raid10_size,
4961         .resize         = raid10_resize,
4962         .takeover       = raid10_takeover,
4963         .check_reshape  = raid10_check_reshape,
4964         .start_reshape  = raid10_start_reshape,
4965         .finish_reshape = raid10_finish_reshape,
4966         .update_reshape_pos = raid10_update_reshape_pos,
4967         .congested      = raid10_congested,
4968 };
4969
4970 static int __init raid_init(void)
4971 {
4972         return register_md_personality(&raid10_personality);
4973 }
4974
4975 static void raid_exit(void)
4976 {
4977         unregister_md_personality(&raid10_personality);
4978 }
4979
4980 module_init(raid_init);
4981 module_exit(raid_exit);
4982 MODULE_LICENSE("GPL");
4983 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4984 MODULE_ALIAS("md-personality-9"); /* RAID10 */
4985 MODULE_ALIAS("md-raid10");
4986 MODULE_ALIAS("md-level-10");
4987
4988 module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);