Merge tag 'md-3.7-fixes' of git://neil.brown.name/md
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 23 Nov 2012 22:11:13 +0000 (12:11 -1000)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 23 Nov 2012 22:11:13 +0000 (12:11 -1000)
Pull md fixes from NeilBrown:
 "Several bug fixes for md in 3.7:

   - raid5 discard has problems
   - raid10 replacement devices have problems
   - bad block lock seqlock usage has problems
   - dm-raid doesn't free everything"

* tag 'md-3.7-fixes' of git://neil.brown.name/md:
  md/raid10: decrement correct pending counter when writing to replacement.
  md/raid10: close race that lose writes lost when replacement completes.
  md/raid5: Make sure we clear R5_Discard when discard is finished.
  md/raid5: move resolving of reconstruct_state earlier in stripe_handle.
  md/raid5: round discard alignment up to power of 2.
  md: make sure everything is freed when dm-raid stops an array.
  md: Avoid write invalid address if read_seqretry returned true.
  md: Reassigned the parameters if read_seqretry returned true in func md_is_badblock.

drivers/md/md.c
drivers/md/raid10.c
drivers/md/raid5.c

index 9ab768acfb623f8bbb13870e5f65150a60ecad07..61200717687b85b3fed040f9599ffd1987b8842d 100644 (file)
@@ -1817,10 +1817,10 @@ retry:
                        memset(bbp, 0xff, PAGE_SIZE);
 
                        for (i = 0 ; i < bb->count ; i++) {
-                               u64 internal_bb = *p++;
+                               u64 internal_bb = p[i];
                                u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
                                                | BB_LEN(internal_bb));
-                               *bbp++ = cpu_to_le64(store_bb);
+                               bbp[i] = cpu_to_le64(store_bb);
                        }
                        bb->changed = 0;
                        if (read_seqretry(&bb->lock, seq))
@@ -5294,7 +5294,7 @@ void md_stop_writes(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(md_stop_writes);
 
-void md_stop(struct mddev *mddev)
+static void __md_stop(struct mddev *mddev)
 {
        mddev->ready = 0;
        mddev->pers->stop(mddev);
@@ -5304,6 +5304,18 @@ void md_stop(struct mddev *mddev)
        mddev->pers = NULL;
        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 }
+
+void md_stop(struct mddev *mddev)
+{
+       /* stop the array and free an attached data structures.
+        * This is called from dm-raid
+        */
+       __md_stop(mddev);
+       bitmap_destroy(mddev);
+       if (mddev->bio_set)
+               bioset_free(mddev->bio_set);
+}
+
 EXPORT_SYMBOL_GPL(md_stop);
 
 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
@@ -5364,7 +5376,7 @@ static int do_md_stop(struct mddev * mddev, int mode,
                        set_disk_ro(disk, 0);
 
                __md_stop_writes(mddev);
-               md_stop(mddev);
+               __md_stop(mddev);
                mddev->queue->merge_bvec_fn = NULL;
                mddev->queue->backing_dev_info.congested_fn = NULL;
 
@@ -7936,9 +7948,9 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
                   sector_t *first_bad, int *bad_sectors)
 {
        int hi;
-       int lo = 0;
+       int lo;
        u64 *p = bb->page;
-       int rv = 0;
+       int rv;
        sector_t target = s + sectors;
        unsigned seq;
 
@@ -7953,7 +7965,8 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
 
 retry:
        seq = read_seqbegin(&bb->lock);
-
+       lo = 0;
+       rv = 0;
        hi = bb->count;
 
        /* Binary search between lo and hi for 'target'
index d1295aff41739eea048f0e43513dfba6ade4c8f1..0d5d0ff2c0f7beb47a02bd4692273b31369b2bc5 100644 (file)
@@ -499,7 +499,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
         */
        one_write_done(r10_bio);
        if (dec_rdev)
-               rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
+               rdev_dec_pending(rdev, conf->mddev);
 }
 
 /*
@@ -1334,18 +1334,21 @@ retry_write:
                        blocked_rdev = rrdev;
                        break;
                }
+               if (rdev && (test_bit(Faulty, &rdev->flags)
+                            || test_bit(Unmerged, &rdev->flags)))
+                       rdev = NULL;
                if (rrdev && (test_bit(Faulty, &rrdev->flags)
                              || test_bit(Unmerged, &rrdev->flags)))
                        rrdev = NULL;
 
                r10_bio->devs[i].bio = NULL;
                r10_bio->devs[i].repl_bio = NULL;
-               if (!rdev || test_bit(Faulty, &rdev->flags) ||
-                   test_bit(Unmerged, &rdev->flags)) {
+
+               if (!rdev && !rrdev) {
                        set_bit(R10BIO_Degraded, &r10_bio->state);
                        continue;
                }
-               if (test_bit(WriteErrorSeen, &rdev->flags)) {
+               if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
                        sector_t first_bad;
                        sector_t dev_sector = r10_bio->devs[i].addr;
                        int bad_sectors;
@@ -1387,8 +1390,10 @@ retry_write:
                                        max_sectors = good_sectors;
                        }
                }
-               r10_bio->devs[i].bio = bio;
-               atomic_inc(&rdev->nr_pending);
+               if (rdev) {
+                       r10_bio->devs[i].bio = bio;
+                       atomic_inc(&rdev->nr_pending);
+               }
                if (rrdev) {
                        r10_bio->devs[i].repl_bio = bio;
                        atomic_inc(&rrdev->nr_pending);
@@ -1444,69 +1449,71 @@ retry_write:
        for (i = 0; i < conf->copies; i++) {
                struct bio *mbio;
                int d = r10_bio->devs[i].devnum;
-               if (!r10_bio->devs[i].bio)
-                       continue;
+               if (r10_bio->devs[i].bio) {
+                       struct md_rdev *rdev = conf->mirrors[d].rdev;
+                       mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+                       md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+                                   max_sectors);
+                       r10_bio->devs[i].bio = mbio;
+
+                       mbio->bi_sector = (r10_bio->devs[i].addr+
+                                          choose_data_offset(r10_bio,
+                                                             rdev));
+                       mbio->bi_bdev = rdev->bdev;
+                       mbio->bi_end_io = raid10_end_write_request;
+                       mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+                       mbio->bi_private = r10_bio;
 
-               mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-                           max_sectors);
-               r10_bio->devs[i].bio = mbio;
+                       atomic_inc(&r10_bio->remaining);
 
-               mbio->bi_sector = (r10_bio->devs[i].addr+
-                                  choose_data_offset(r10_bio,
-                                                     conf->mirrors[d].rdev));
-               mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
-               mbio->bi_end_io = raid10_end_write_request;
-               mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
-               mbio->bi_private = r10_bio;
+                       cb = blk_check_plugged(raid10_unplug, mddev,
+                                              sizeof(*plug));
+                       if (cb)
+                               plug = container_of(cb, struct raid10_plug_cb,
+                                                   cb);
+                       else
+                               plug = NULL;
+                       spin_lock_irqsave(&conf->device_lock, flags);
+                       if (plug) {
+                               bio_list_add(&plug->pending, mbio);
+                               plug->pending_cnt++;
+                       } else {
+                               bio_list_add(&conf->pending_bio_list, mbio);
+                               conf->pending_count++;
+                       }
+                       spin_unlock_irqrestore(&conf->device_lock, flags);
+                       if (!plug)
+                               md_wakeup_thread(mddev->thread);
+               }
 
-               atomic_inc(&r10_bio->remaining);
+               if (r10_bio->devs[i].repl_bio) {
+                       struct md_rdev *rdev = conf->mirrors[d].replacement;
+                       if (rdev == NULL) {
+                               /* Replacement just got moved to main 'rdev' */
+                               smp_mb();
+                               rdev = conf->mirrors[d].rdev;
+                       }
+                       mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+                       md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+                                   max_sectors);
+                       r10_bio->devs[i].repl_bio = mbio;
+
+                       mbio->bi_sector = (r10_bio->devs[i].addr +
+                                          choose_data_offset(
+                                                  r10_bio, rdev));
+                       mbio->bi_bdev = rdev->bdev;
+                       mbio->bi_end_io = raid10_end_write_request;
+                       mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+                       mbio->bi_private = r10_bio;
 
-               cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
-               if (cb)
-                       plug = container_of(cb, struct raid10_plug_cb, cb);
-               else
-                       plug = NULL;
-               spin_lock_irqsave(&conf->device_lock, flags);
-               if (plug) {
-                       bio_list_add(&plug->pending, mbio);
-                       plug->pending_cnt++;
-               } else {
+                       atomic_inc(&r10_bio->remaining);
+                       spin_lock_irqsave(&conf->device_lock, flags);
                        bio_list_add(&conf->pending_bio_list, mbio);
                        conf->pending_count++;
+                       spin_unlock_irqrestore(&conf->device_lock, flags);
+                       if (!mddev_check_plugged(mddev))
+                               md_wakeup_thread(mddev->thread);
                }
-               spin_unlock_irqrestore(&conf->device_lock, flags);
-               if (!plug)
-                       md_wakeup_thread(mddev->thread);
-
-               if (!r10_bio->devs[i].repl_bio)
-                       continue;
-
-               mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-                           max_sectors);
-               r10_bio->devs[i].repl_bio = mbio;
-
-               /* We are actively writing to the original device
-                * so it cannot disappear, so the replacement cannot
-                * become NULL here
-                */
-               mbio->bi_sector = (r10_bio->devs[i].addr +
-                                  choose_data_offset(
-                                          r10_bio,
-                                          conf->mirrors[d].replacement));
-               mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
-               mbio->bi_end_io = raid10_end_write_request;
-               mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
-               mbio->bi_private = r10_bio;
-
-               atomic_inc(&r10_bio->remaining);
-               spin_lock_irqsave(&conf->device_lock, flags);
-               bio_list_add(&conf->pending_bio_list, mbio);
-               conf->pending_count++;
-               spin_unlock_irqrestore(&conf->device_lock, flags);
-               if (!mddev_check_plugged(mddev))
-                       md_wakeup_thread(mddev->thread);
        }
 
        /* Don't remove the bias on 'remaining' (one_write_done) until
index c5439dce0295078ecf82094af5474a649284ce61..a4502686e7a8763fb5aeded988f026b2a9dd1c99 100644 (file)
@@ -2774,10 +2774,12 @@ static void handle_stripe_clean_event(struct r5conf *conf,
                        dev = &sh->dev[i];
                        if (!test_bit(R5_LOCKED, &dev->flags) &&
                            (test_bit(R5_UPTODATE, &dev->flags) ||
-                            test_and_clear_bit(R5_Discard, &dev->flags))) {
+                            test_bit(R5_Discard, &dev->flags))) {
                                /* We can return any write requests */
                                struct bio *wbi, *wbi2;
                                pr_debug("Return write for disc %d\n", i);
+                               if (test_and_clear_bit(R5_Discard, &dev->flags))
+                                       clear_bit(R5_UPTODATE, &dev->flags);
                                wbi = dev->written;
                                dev->written = NULL;
                                while (wbi && wbi->bi_sector <
@@ -2795,7 +2797,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
                                         !test_bit(STRIPE_DEGRADED, &sh->state),
                                                0);
                        }
-               }
+               } else if (test_bit(R5_Discard, &sh->dev[i].flags))
+                       clear_bit(R5_Discard, &sh->dev[i].flags);
 
        if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
                if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -3490,40 +3493,6 @@ static void handle_stripe(struct stripe_head *sh)
                        handle_failed_sync(conf, sh, &s);
        }
 
-       /*
-        * might be able to return some write requests if the parity blocks
-        * are safe, or on a failed drive
-        */
-       pdev = &sh->dev[sh->pd_idx];
-       s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
-               || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
-       qdev = &sh->dev[sh->qd_idx];
-       s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
-               || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
-               || conf->level < 6;
-
-       if (s.written &&
-           (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
-                            && !test_bit(R5_LOCKED, &pdev->flags)
-                            && (test_bit(R5_UPTODATE, &pdev->flags) ||
-                                test_bit(R5_Discard, &pdev->flags))))) &&
-           (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
-                            && !test_bit(R5_LOCKED, &qdev->flags)
-                            && (test_bit(R5_UPTODATE, &qdev->flags) ||
-                                test_bit(R5_Discard, &qdev->flags))))))
-               handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
-
-       /* Now we might consider reading some blocks, either to check/generate
-        * parity, or to satisfy requests
-        * or to load a block that is being partially written.
-        */
-       if (s.to_read || s.non_overwrite
-           || (conf->level == 6 && s.to_write && s.failed)
-           || (s.syncing && (s.uptodate + s.compute < disks))
-           || s.replacing
-           || s.expanding)
-               handle_stripe_fill(sh, &s, disks);
-
        /* Now we check to see if any write operations have recently
         * completed
         */
@@ -3561,6 +3530,40 @@ static void handle_stripe(struct stripe_head *sh)
                        s.dec_preread_active = 1;
        }
 
+       /*
+        * might be able to return some write requests if the parity blocks
+        * are safe, or on a failed drive
+        */
+       pdev = &sh->dev[sh->pd_idx];
+       s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
+               || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
+       qdev = &sh->dev[sh->qd_idx];
+       s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
+               || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
+               || conf->level < 6;
+
+       if (s.written &&
+           (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+                            && !test_bit(R5_LOCKED, &pdev->flags)
+                            && (test_bit(R5_UPTODATE, &pdev->flags) ||
+                                test_bit(R5_Discard, &pdev->flags))))) &&
+           (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+                            && !test_bit(R5_LOCKED, &qdev->flags)
+                            && (test_bit(R5_UPTODATE, &qdev->flags) ||
+                                test_bit(R5_Discard, &qdev->flags))))))
+               handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+
+       /* Now we might consider reading some blocks, either to check/generate
+        * parity, or to satisfy requests
+        * or to load a block that is being partially written.
+        */
+       if (s.to_read || s.non_overwrite
+           || (conf->level == 6 && s.to_write && s.failed)
+           || (s.syncing && (s.uptodate + s.compute < disks))
+           || s.replacing
+           || s.expanding)
+               handle_stripe_fill(sh, &s, disks);
+
        /* Now to consider new write requests and what else, if anything
         * should be read.  We do not handle new writes when:
         * 1/ A 'write' operation (copy+xor) is already in flight.
@@ -5529,6 +5532,10 @@ static int run(struct mddev *mddev)
                 * discard data disk but write parity disk
                 */
                stripe = stripe * PAGE_SIZE;
+               /* Round up to power of 2, as discard handling
+                * currently assumes that */
+               while ((stripe-1) & stripe)
+                       stripe = (stripe | (stripe-1)) + 1;
                mddev->queue->limits.discard_alignment = stripe;
                mddev->queue->limits.discard_granularity = stripe;
                /*