md/raid5: simplify delaying of writes while metadata is updated.
author NeilBrown <neilb@suse.com>
Wed, 15 Mar 2017 03:05:12 +0000 (14:05 +1100)
committer Shaohua Li <shli@fb.com>
Thu, 23 Mar 2017 02:15:57 +0000 (19:15 -0700)
If a device fails during a write, we must ensure the failure is
recorded in the metadata before the completion of the write is
acknowledged.

Commit c3cce6cda162 ("md/raid5: ensure device failure recorded before
write request returns.") added code for this, but it was
unnecessarily complicated.  We already had similar functionality for
handling updates to the bad-block-list, thanks to Commit de393cdea66c
("md: make it easier to wait for bad blocks to be acknowledged.").

So revert most of the former commit, and instead avoid collecting
completed writes if MD_SB_CHANGE_PENDING is set.  raid5d() will then flush
the metadata and retry the stripe_head.
As this change can leave a stripe_head ready for handling immediately
after handle_active_stripes() returns, we change raid5_do_work() to
pause when MD_SB_CHANGE_PENDING is set, so that it doesn't spin.

We check MD_SB_CHANGE_PENDING *after* analyse_stripe() as it could be set
asynchronously.  After analyse_stripe(), we have collected stable data
about the state of devices, which will be used to make decisions.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
drivers/md/raid5.c
drivers/md/raid5.h

index a684003fc9651bf6453234202049d4a5b307af8e..a2c9ddc353359b173f84dfbf80320b3a702aa686 100644 (file)
@@ -4691,7 +4691,8 @@ static void handle_stripe(struct stripe_head *sh)
        if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
                goto finish;
 
-       if (s.handle_bad_blocks) {
+       if (s.handle_bad_blocks ||
+           test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
                set_bit(STRIPE_HANDLE, &sh->state);
                goto finish;
        }
@@ -5021,15 +5022,8 @@ finish:
                        md_wakeup_thread(conf->mddev->thread);
        }
 
-       if (!bio_list_empty(&s.return_bi)) {
-               if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
-                       spin_lock_irq(&conf->device_lock);
-                       bio_list_merge(&conf->return_bi, &s.return_bi);
-                       spin_unlock_irq(&conf->device_lock);
-                       md_wakeup_thread(conf->mddev->thread);
-               } else
-                       return_io(&s.return_bi);
-       }
+       if (!bio_list_empty(&s.return_bi))
+               return_io(&s.return_bi);
 
        clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
@@ -6226,6 +6220,7 @@ static void raid5_do_work(struct work_struct *work)
        struct r5worker *worker = container_of(work, struct r5worker, work);
        struct r5worker_group *group = worker->group;
        struct r5conf *conf = group->conf;
+       struct mddev *mddev = conf->mddev;
        int group_id = group - conf->worker_groups;
        int handled;
        struct blk_plug plug;
@@ -6246,6 +6241,9 @@ static void raid5_do_work(struct work_struct *work)
                if (!batch_size && !released)
                        break;
                handled += batch_size;
+               wait_event_lock_irq(mddev->sb_wait,
+                       !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+                       conf->device_lock);
        }
        pr_debug("%d stripes handled\n", handled);
 
@@ -6273,18 +6271,6 @@ static void raid5d(struct md_thread *thread)
 
        md_check_recovery(mddev);
 
-       if (!bio_list_empty(&conf->return_bi) &&
-           !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-               struct bio_list tmp = BIO_EMPTY_LIST;
-               spin_lock_irq(&conf->device_lock);
-               if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-                       bio_list_merge(&tmp, &conf->return_bi);
-                       bio_list_init(&conf->return_bi);
-               }
-               spin_unlock_irq(&conf->device_lock);
-               return_io(&tmp);
-       }
-
        blk_start_plug(&plug);
        handled = 0;
        spin_lock_irq(&conf->device_lock);
@@ -6936,7 +6922,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        INIT_LIST_HEAD(&conf->hold_list);
        INIT_LIST_HEAD(&conf->delayed_list);
        INIT_LIST_HEAD(&conf->bitmap_list);
-       bio_list_init(&conf->return_bi);
        init_llist_head(&conf->released_stripes);
        atomic_set(&conf->active_stripes, 0);
        atomic_set(&conf->preread_active_stripes, 0);
index ba5b7a3790af05ab89baa002a0b937d3f3096422..13800dc9dd8861c9048f7213644da30f46ecd319 100644 (file)
@@ -638,9 +638,6 @@ struct r5conf {
        int                     skip_copy; /* Don't copy data from bio to stripe cache */
        struct list_head        *last_hold; /* detect hold_list promotions */
 
-       /* bios to have bi_end_io called after metadata is synced */
-       struct bio_list         return_bi;
-
        atomic_t                reshape_stripes; /* stripes with pending writes for reshape */
        /* unfortunately we need two cache names as we temporarily have
         * two caches.