Merge branch 'for-linus-4.12-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
[sfrench/cifs-2.6.git] / drivers/md/raid5.c
index 2efdb0d6746074a0416f18b0c0dfbab7cc5be6a8..2e38cfac5b1dc5a318f66b4bb6e2e2195797ad53 100644 (file)
 #include <linux/sched/signal.h>
 
 #include <trace/events/block.h>
+#include <linux/list_sort.h>
 
 #include "md.h"
 #include "raid5.h"
 #include "raid0.h"
 #include "bitmap.h"
+#include "raid5-log.h"
 
 #define UNSUPPORTED_MDDEV_FLAGS        (1L << MD_FAILFAST_SUPPORTED)
 
@@ -156,17 +158,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
        return slot;
 }
 
-static void return_io(struct bio_list *return_bi)
-{
-       struct bio *bi;
-       while ((bi = bio_list_pop(return_bi)) != NULL) {
-               bi->bi_iter.bi_size = 0;
-               trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-                                        bi, 0);
-               bio_endio(bi);
-       }
-}
-
 static void print_raid5_conf (struct r5conf *conf);
 
 static int stripe_operations_active(struct stripe_head *sh)
@@ -176,6 +167,13 @@ static int stripe_operations_active(struct stripe_head *sh)
               test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
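+/*
+ * r5c write-back stripes that are being written out to the array (full or
+ * partial cached stripes not in the caching phase) are queued on separate
+ * low-priority lists; __get_priority_stripe() decides which list to serve
+ * first depending on how tight the cache log is.
+ */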
+static bool stripe_is_lowprio(struct stripe_head *sh)
+{
+       return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
+               test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
+              !test_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 {
        struct r5conf *conf = sh->raid_conf;
@@ -191,7 +189,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
        if (list_empty(&sh->lru)) {
                struct r5worker_group *group;
                group = conf->worker_groups + cpu_to_group(cpu);
-               list_add_tail(&sh->lru, &group->handle_list);
+               if (stripe_is_lowprio(sh))
+                       list_add_tail(&sh->lru, &group->loprio_list);
+               else
+                       list_add_tail(&sh->lru, &group->handle_list);
                group->stripes_cnt++;
                sh->group = group;
        }
@@ -254,7 +255,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
                        clear_bit(STRIPE_DELAYED, &sh->state);
                        clear_bit(STRIPE_BIT_DELAY, &sh->state);
                        if (conf->worker_cnt_per_group == 0) {
-                               list_add_tail(&sh->lru, &conf->handle_list);
+                               if (stripe_is_lowprio(sh))
+                                       list_add_tail(&sh->lru,
+                                                       &conf->loprio_list);
+                               else
+                                       list_add_tail(&sh->lru,
+                                                       &conf->handle_list);
                        } else {
                                raid5_wakeup_stripe_thread(sh);
                                return;
@@ -481,6 +487,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
                sh->dev[i].page = page;
                sh->dev[i].orig_page = page;
        }
+
        return 0;
 }
 
@@ -729,7 +736,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
 {
        struct r5conf *conf = sh->raid_conf;
 
-       if (conf->log)
+       if (conf->log || raid5_has_ppl(conf))
                return false;
        return test_bit(STRIPE_BATCH_READY, &sh->state) &&
                !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -863,41 +870,107 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
        return 1;
 }
 
-static void flush_deferred_bios(struct r5conf *conf)
+static void dispatch_bio_list(struct bio_list *tmp)
 {
-       struct bio_list tmp;
        struct bio *bio;
 
-       if (!conf->batch_bio_dispatch || !conf->group_cnt)
+       while ((bio = bio_list_pop(tmp)))
+               generic_make_request(bio);
+}
+
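+/* list_sort() comparator: order pending stripe data by start sector, ascending */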
+static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
+{
+       const struct r5pending_data *da = list_entry(a,
+                               struct r5pending_data, sibling);
+       const struct r5pending_data *db = list_entry(b,
+                               struct r5pending_data, sibling);
+       if (da->sector > db->sector)
+               return 1;
+       if (da->sector < db->sector)
+               return -1;
+       return 0;
+}
+
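+/*
+ * Move the bios of up to @target queued r5pending_data entries into @list:
+ * sort the pending list by sector, start from where the previous flush left
+ * off (next_pending_data) and return the drained entries to the free list.
+ */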
+static void dispatch_defer_bios(struct r5conf *conf, int target,
+                               struct bio_list *list)
+{
+       struct r5pending_data *data;
+       struct list_head *first, *next = NULL;
+       int cnt = 0;
+
+       if (conf->pending_data_cnt == 0)
+               return;
+
+       list_sort(NULL, &conf->pending_list, cmp_stripe);
+
+       first = conf->pending_list.next;
+
+       /* temporarily move the head */
+       if (conf->next_pending_data)
+               list_move_tail(&conf->pending_list,
+                               &conf->next_pending_data->sibling);
+
+       while (!list_empty(&conf->pending_list)) {
+               data = list_first_entry(&conf->pending_list,
+                       struct r5pending_data, sibling);
+               if (&data->sibling == first)
+                       first = data->sibling.next;
+               next = data->sibling.next;
+
+               bio_list_merge(list, &data->bios);
+               list_move(&data->sibling, &conf->free_list);
+               cnt++;
+               if (cnt >= target)
+                       break;
+       }
+       conf->pending_data_cnt -= cnt;
+       BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
+
+       if (next != &conf->pending_list)
+               conf->next_pending_data = list_entry(next,
+                               struct r5pending_data, sibling);
+       else
+               conf->next_pending_data = NULL;
+       /* list isn't empty */
+       if (first != &conf->pending_list)
+               list_move_tail(&conf->pending_list, first);
+}
+
+static void flush_deferred_bios(struct r5conf *conf)
+{
+       struct bio_list tmp = BIO_EMPTY_LIST;
+
+       if (conf->pending_data_cnt == 0)
                return;
 
-       bio_list_init(&tmp);
        spin_lock(&conf->pending_bios_lock);
-       bio_list_merge(&tmp, &conf->pending_bios);
-       bio_list_init(&conf->pending_bios);
+       dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
+       BUG_ON(conf->pending_data_cnt != 0);
        spin_unlock(&conf->pending_bios_lock);
 
-       while ((bio = bio_list_pop(&tmp)))
-               generic_make_request(bio);
+       dispatch_bio_list(&tmp);
 }
 
-static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
+static void defer_issue_bios(struct r5conf *conf, sector_t sector,
+                               struct bio_list *bios)
 {
-       /*
-        * change group_cnt will drain all bios, so this is safe
-        *
-        * A read generally means a read-modify-write, which usually means a
-        * randwrite, so we don't delay it
-        */
-       if (!conf->batch_bio_dispatch || !conf->group_cnt ||
-           bio_op(bio) == REQ_OP_READ) {
-               generic_make_request(bio);
-               return;
-       }
+       struct bio_list tmp = BIO_EMPTY_LIST;
+       struct r5pending_data *ent;
+
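+       /*
+        * Park this stripe's write bios on a free r5pending_data entry; once
+        * PENDING_IO_MAX entries have accumulated, pull out a batch of
+        * PENDING_IO_ONE_FLUSH entries (sorted by sector) and submit it after
+        * the lock is dropped.
+        */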
        spin_lock(&conf->pending_bios_lock);
-       bio_list_add(&conf->pending_bios, bio);
+       ent = list_first_entry(&conf->free_list, struct r5pending_data,
+                                                       sibling);
+       list_move_tail(&ent->sibling, &conf->pending_list);
+       ent->sector = sector;
+       bio_list_init(&ent->bios);
+       bio_list_merge(&ent->bios, bios);
+       conf->pending_data_cnt++;
+       if (conf->pending_data_cnt >= PENDING_IO_MAX)
+               dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
+
        spin_unlock(&conf->pending_bios_lock);
-       md_wakeup_thread(conf->mddev->thread);
+
+       dispatch_bio_list(&tmp);
 }
 
 static void
@@ -910,21 +983,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
        struct r5conf *conf = sh->raid_conf;
        int i, disks = sh->disks;
        struct stripe_head *head_sh = sh;
+       struct bio_list pending_bios = BIO_EMPTY_LIST;
+       bool should_defer;
 
        might_sleep();
 
-       if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
-               /* writing out phase */
-               if (s->waiting_extra_page)
-                       return;
-               if (r5l_write_stripe(conf->log, sh) == 0)
-                       return;
-       } else {  /* caching phase */
-               if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
-                       r5c_cache_data(conf->log, sh, s);
-                       return;
-               }
-       }
+       if (log_stripe(sh, s) == 0)
+               return;
+
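+       /*
+        * With worker groups enabled, write bios are collected here and
+        * submitted later in sector-sorted batches (see defer_issue_bios()).
+        */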
+       should_defer = conf->batch_bio_dispatch && conf->group_cnt;
 
        for (i = disks; i--; ) {
                int op, op_flags = 0;
@@ -1080,7 +1147,10 @@ again:
                                trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
                                                      bi, disk_devt(conf->mddev->gendisk),
                                                      sh->dev[i].sector);
-                       defer_bio_issue(conf, bi);
+                       if (should_defer && op_is_write(op))
+                               bio_list_add(&pending_bios, bi);
+                       else
+                               generic_make_request(bi);
                }
                if (rrdev) {
                        if (s->syncing || s->expanding || s->expanded
@@ -1125,7 +1195,10 @@ again:
                                trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
                                                      rbi, disk_devt(conf->mddev->gendisk),
                                                      sh->dev[i].sector);
-                       defer_bio_issue(conf, rbi);
+                       if (should_defer && op_is_write(op))
+                               bio_list_add(&pending_bios, rbi);
+                       else
+                               generic_make_request(rbi);
                }
                if (!rdev && !rrdev) {
                        if (op_is_write(op))
@@ -1143,6 +1216,9 @@ again:
                if (sh != head_sh)
                        goto again;
        }
+
+       if (should_defer && !bio_list_empty(&pending_bios))
+               defer_issue_bios(conf, head_sh->sector, &pending_bios);
 }
 
 static struct dma_async_tx_descriptor *
@@ -1212,7 +1288,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 static void ops_complete_biofill(void *stripe_head_ref)
 {
        struct stripe_head *sh = stripe_head_ref;
-       struct bio_list return_bi = BIO_EMPTY_LIST;
        int i;
 
        pr_debug("%s: stripe %llu\n", __func__,
@@ -1236,16 +1311,13 @@ static void ops_complete_biofill(void *stripe_head_ref)
                        while (rbi && rbi->bi_iter.bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                rbi2 = r5_next_bio(rbi, dev->sector);
-                               if (!raid5_dec_bi_active_stripes(rbi))
-                                       bio_list_add(&return_bi, rbi);
+                               bio_endio(rbi);
                                rbi = rbi2;
                        }
                }
        }
        clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
-       return_io(&return_bi);
-
        set_bit(STRIPE_HANDLE, &sh->state);
        raid5_release_stripe(sh);
 }
@@ -2014,6 +2086,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
                        tx = ops_run_prexor6(sh, percpu, tx);
        }
 
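+       /* for PPL, compute the partial parity before draining in new data */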
+       if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+               tx = ops_run_partial_parity(sh, percpu, tx);
+
        if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
                tx = ops_run_biodrain(sh, tx);
                overlap_clear++;
@@ -2046,8 +2121,15 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
        put_cpu();
 }
 
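+/* free a stripe_head together with its partial parity (PPL) page, if any */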
+static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
+{
+       if (sh->ppl_page)
+               __free_page(sh->ppl_page);
+       kmem_cache_free(sc, sh);
+}
+
 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
-       int disks)
+       int disks, struct r5conf *conf)
 {
        struct stripe_head *sh;
        int i;
@@ -2061,6 +2143,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
                INIT_LIST_HEAD(&sh->r5c);
                INIT_LIST_HEAD(&sh->log_list);
                atomic_set(&sh->count, 1);
+               sh->raid_conf = conf;
                sh->log_start = MaxSector;
                for (i = 0; i < disks; i++) {
                        struct r5dev *dev = &sh->dev[i];
@@ -2068,6 +2151,14 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
                        bio_init(&dev->req, &dev->vec, 1);
                        bio_init(&dev->rreq, &dev->rvec, 1);
                }
+
+               if (raid5_has_ppl(conf)) {
+                       sh->ppl_page = alloc_page(gfp);
+                       if (!sh->ppl_page) {
+                               free_stripe(sc, sh);
+                               sh = NULL;
+                       }
+               }
        }
        return sh;
 }
@@ -2075,15 +2166,13 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
        struct stripe_head *sh;
 
-       sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
+       sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
        if (!sh)
                return 0;
 
-       sh->raid_conf = conf;
-
        if (grow_buffers(sh, gfp)) {
                shrink_buffers(sh);
-               kmem_cache_free(conf->slab_cache, sh);
+               free_stripe(conf->slab_cache, sh);
                return 0;
        }
        sh->hash_lock_index =
@@ -2210,7 +2299,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
         *    pages have been transferred over, and the old kmem_cache is
         *    freed when all stripes are done.
         * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
-        *    we simple return a failre status - no need to clean anything up.
+        *    we simply return a failure status - no need to clean anything up.
         * 4/ allocate new pages for the new slots in the new stripe_heads.
         *    If this fails, we don't bother trying the shrink the
         *    stripe_heads down again, we just leave them as they are.
@@ -2228,9 +2317,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        int i;
        int hash, cnt;
 
-       if (newsize <= conf->pool_size)
-               return 0; /* never bother to shrink */
-
        err = md_allow_write(conf->mddev);
        if (err)
                return err;
@@ -2246,11 +2332,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        mutex_lock(&conf->cache_size_mutex);
 
        for (i = conf->max_nr_stripes; i; i--) {
-               nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
+               nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
                if (!nsh)
                        break;
 
-               nsh->raid_conf = conf;
                list_add(&nsh->lru, &newstripes);
        }
        if (i) {
@@ -2258,7 +2343,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
                while (!list_empty(&newstripes)) {
                        nsh = list_entry(newstripes.next, struct stripe_head, lru);
                        list_del(&nsh->lru);
-                       kmem_cache_free(sc, nsh);
+                       free_stripe(sc, nsh);
                }
                kmem_cache_destroy(sc);
                mutex_unlock(&conf->cache_size_mutex);
@@ -2284,7 +2369,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
                        nsh->dev[i].orig_page = osh->dev[i].page;
                }
                nsh->hash_lock_index = hash;
-               kmem_cache_free(conf->slab_cache, osh);
+               free_stripe(conf->slab_cache, osh);
                cnt++;
                if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
                    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
@@ -2323,6 +2408,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
                err = -ENOMEM;
 
        mutex_unlock(&conf->cache_size_mutex);
+
+       conf->slab_cache = sc;
+       conf->active_name = 1-conf->active_name;
+
        /* Step 4, return new stripes to service */
        while(!list_empty(&newstripes)) {
                nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2340,8 +2429,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        }
        /* critical section pass, GFP_NOIO no longer needed */
 
-       conf->slab_cache = sc;
-       conf->active_name = 1-conf->active_name;
        if (!err)
                conf->pool_size = newsize;
        return err;
@@ -2359,7 +2446,7 @@ static int drop_one_stripe(struct r5conf *conf)
                return 0;
        BUG_ON(atomic_read(&sh->count));
        shrink_buffers(sh);
-       kmem_cache_free(conf->slab_cache, sh);
+       free_stripe(conf->slab_cache, sh);
        atomic_dec(&conf->active_stripes);
        conf->max_nr_stripes--;
        return 1;
@@ -3082,6 +3169,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                s->locked++;
        }
 
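+       /* with PPL, schedule the partial parity op unless the whole stripe is overwritten */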
+       if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
+           test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+           !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+           test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+               set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
        pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
                __func__, (unsigned long long)sh->sector,
                s->locked, s->ops_request);
@@ -3103,14 +3196,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
                (unsigned long long)bi->bi_iter.bi_sector,
                (unsigned long long)sh->sector);
 
-       /*
-        * If several bio share a stripe. The bio bi_phys_segments acts as a
-        * reference count to avoid race. The reference count should already be
-        * increased before this function is called (for example, in
-        * raid5_make_request()), so other bio sharing this stripe will not free the
-        * stripe. If a stripe is owned by one stripe, the stripe lock will
-        * protect it.
-        */
        spin_lock_irq(&sh->stripe_lock);
        /* Don't allow new IO added to stripes in batch list */
        if (sh->batch_head)
@@ -3129,6 +3214,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
        if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
                goto overlap;
 
+       if (forwrite && raid5_has_ppl(conf)) {
+               /*
+                * With PPL only writes to consecutive data chunks within a
+                * stripe are allowed because for a single stripe_head we can
+                * only have one PPL entry at a time, which describes one data
+                * range. Not really an overlap, but wait_for_overlap can be
+                * used to handle this.
+                */
+               sector_t sector;
+               sector_t first = 0;
+               sector_t last = 0;
+               int count = 0;
+               int i;
+
+               for (i = 0; i < sh->disks; i++) {
+                       if (i != sh->pd_idx &&
+                           (i == dd_idx || sh->dev[i].towrite)) {
+                               sector = sh->dev[i].sector;
+                               if (count == 0 || sector < first)
+                                       first = sector;
+                               if (sector > last)
+                                       last = sector;
+                               count++;
+                       }
+               }
+
+               if (first + conf->chunk_sectors * (count - 1) != last)
+                       goto overlap;
+       }
+
        if (!forwrite || previous)
                clear_bit(STRIPE_BATCH_READY, &sh->state);
 
@@ -3136,7 +3251,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
        if (*bip)
                bi->bi_next = *bip;
        *bip = bi;
-       raid5_inc_bi_active_stripes(bi);
+       bio_inc_remaining(bi);
+       md_write_inc(conf->mddev, bi);
 
        if (forwrite) {
                /* check if page is covered */
@@ -3213,8 +3329,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 
 static void
 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
-                               struct stripe_head_state *s, int disks,
-                               struct bio_list *return_bi)
+                    struct stripe_head_state *s, int disks)
 {
        int i;
        BUG_ON(sh->batch_head);
@@ -3250,7 +3365,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                if (bi)
                        bitmap_end = 1;
 
-               r5l_stripe_write_finished(sh);
+               log_stripe_write_finished(sh);
 
                if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                        wake_up(&conf->wait_for_overlap);
@@ -3260,10 +3375,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                        struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 
                        bi->bi_error = -EIO;
-                       if (!raid5_dec_bi_active_stripes(bi)) {
-                               md_write_end(conf->mddev);
-                               bio_list_add(return_bi, bi);
-                       }
+                       md_write_end(conf->mddev);
+                       bio_endio(bi);
                        bi = nextbi;
                }
                if (bitmap_end)
@@ -3284,10 +3397,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                        struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 
                        bi->bi_error = -EIO;
-                       if (!raid5_dec_bi_active_stripes(bi)) {
-                               md_write_end(conf->mddev);
-                               bio_list_add(return_bi, bi);
-                       }
+                       md_write_end(conf->mddev);
+                       bio_endio(bi);
                        bi = bi2;
                }
 
@@ -3312,8 +3423,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                                        r5_next_bio(bi, sh->dev[i].sector);
 
                                bi->bi_error = -EIO;
-                               if (!raid5_dec_bi_active_stripes(bi))
-                                       bio_list_add(return_bi, bi);
+                               bio_endio(bi);
                                bi = nextbi;
                        }
                }
@@ -3449,7 +3559,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
            !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                /* Pre-reads at not permitted until after short delay
                 * to gather multiple requests.  However if this
-                * device is no Insync, the block could only be be computed
+                * device is not Insync, the block could only be computed
                 * and there is no need to delay that.
                 */
                return 0;
@@ -3468,7 +3578,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 
        /* If we are forced to do a reconstruct-write, either because
         * the current RAID6 implementation only supports that, or
-        * or because parity cannot be trusted and we are currently
+        * because parity cannot be trusted and we are currently
         * recovering it, there is extra need to be careful.
         * If one of the devices that we would need to read, because
         * it is not being overwritten (and maybe not written at all)
@@ -3508,9 +3618,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
                BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
                BUG_ON(test_bit(R5_Wantread, &dev->flags));
                BUG_ON(sh->batch_head);
+
+               /*
+                * In the raid6 case if the only non-uptodate disk is P
+                * then we already trusted P to compute the other failed
+                * drives. It is safe to compute rather than re-read P.
+                * In other cases we only compute blocks from failed
+                * devices, otherwise check/repair might fail to detect
+                * a real inconsistency.
+                */
+
                if ((s->uptodate == disks - 1) &&
+                   ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
                    (s->failed && (disk_idx == s->failed_num[0] ||
-                                  disk_idx == s->failed_num[1]))) {
+                                  disk_idx == s->failed_num[1])))) {
                        /* have disk failed, and we're requested to fetch it;
                         * do compute it
                         */
@@ -3612,7 +3733,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
  * never LOCKED, so we don't need to test 'failed' directly.
  */
 static void handle_stripe_clean_event(struct r5conf *conf,
-       struct stripe_head *sh, int disks, struct bio_list *return_bi)
+       struct stripe_head *sh, int disks)
 {
        int i;
        struct r5dev *dev;
@@ -3644,10 +3765,8 @@ returnbi:
                                while (wbi && wbi->bi_iter.bi_sector <
                                        dev->sector + STRIPE_SECTORS) {
                                        wbi2 = r5_next_bio(wbi, dev->sector);
-                                       if (!raid5_dec_bi_active_stripes(wbi)) {
-                                               md_write_end(conf->mddev);
-                                               bio_list_add(return_bi, wbi);
-                                       }
+                                       md_write_end(conf->mddev);
+                                       bio_endio(wbi);
                                        wbi = wbi2;
                                }
                                bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -3669,7 +3788,7 @@ returnbi:
                                discard_pending = 1;
                }
 
-       r5l_stripe_write_finished(sh);
+       log_stripe_write_finished(sh);
 
        if (!discard_pending &&
            test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -4556,7 +4675,8 @@ static void handle_stripe(struct stripe_head *sh)
        if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
                goto finish;
 
-       if (s.handle_bad_blocks) {
+       if (s.handle_bad_blocks ||
+           test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
                set_bit(STRIPE_HANDLE, &sh->state);
                goto finish;
        }
@@ -4589,7 +4709,7 @@ static void handle_stripe(struct stripe_head *sh)
                sh->reconstruct_state = 0;
                break_stripe_batch_list(sh, 0);
                if (s.to_read+s.to_write+s.written)
-                       handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+                       handle_failed_stripe(conf, sh, &s, disks);
                if (s.syncing + s.replacing)
                        handle_failed_sync(conf, sh, &s);
        }
@@ -4655,11 +4775,11 @@ static void handle_stripe(struct stripe_head *sh)
                             && !test_bit(R5_LOCKED, &qdev->flags)
                             && (test_bit(R5_UPTODATE, &qdev->flags) ||
                                 test_bit(R5_Discard, &qdev->flags))))))
-               handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+               handle_stripe_clean_event(conf, sh, disks);
 
        if (s.just_cached)
-               r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
-       r5l_stripe_write_finished(sh);
+               r5c_handle_cached_data_endio(conf, sh, disks);
+       log_stripe_write_finished(sh);
 
        /* Now we might consider reading some blocks, either to check/generate
         * parity, or to satisfy requests
@@ -4886,16 +5006,6 @@ finish:
                        md_wakeup_thread(conf->mddev->thread);
        }
 
-       if (!bio_list_empty(&s.return_bi)) {
-               if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
-                       spin_lock_irq(&conf->device_lock);
-                       bio_list_merge(&conf->return_bi, &s.return_bi);
-                       spin_unlock_irq(&conf->device_lock);
-                       md_wakeup_thread(conf->mddev->thread);
-               } else
-                       return_io(&s.return_bi);
-       }
-
        clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
 
@@ -4984,12 +5094,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
        md_wakeup_thread(conf->mddev->thread);
 }
 
-static struct bio *remove_bio_from_retry(struct r5conf *conf)
+static struct bio *remove_bio_from_retry(struct r5conf *conf,
+                                        unsigned int *offset)
 {
        struct bio *bi;
 
        bi = conf->retry_read_aligned;
        if (bi) {
+               *offset = conf->retry_read_offset;
                conf->retry_read_aligned = NULL;
                return bi;
        }
@@ -4997,11 +5109,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
        if(bi) {
                conf->retry_read_aligned_list = bi->bi_next;
                bi->bi_next = NULL;
-               /*
-                * this sets the active strip count to 1 and the processed
-                * strip count to zero (upper 8 bits)
-                */
-               raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
+               *offset = 0;
        }
 
        return bi;
@@ -5136,24 +5244,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
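+/*
+ * Split a read at the chunk boundary: the tail is pushed back through
+ * generic_make_request() and a direct, chunk-aligned read is attempted on
+ * the leading part; if that is not possible the leading bio is returned
+ * for normal stripe handling.
+ */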
 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
 {
        struct bio *split;
+       sector_t sector = raid_bio->bi_iter.bi_sector;
+       unsigned chunk_sects = mddev->chunk_sectors;
+       unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
 
-       do {
-               sector_t sector = raid_bio->bi_iter.bi_sector;
-               unsigned chunk_sects = mddev->chunk_sectors;
-               unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
-
-               if (sectors < bio_sectors(raid_bio)) {
-                       split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
-                       bio_chain(split, raid_bio);
-               } else
-                       split = raid_bio;
+       if (sectors < bio_sectors(raid_bio)) {
+               struct r5conf *conf = mddev->private;
+               split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+               bio_chain(split, raid_bio);
+               generic_make_request(raid_bio);
+               raid_bio = split;
+       }
 
-               if (!raid5_read_one_chunk(mddev, split)) {
-                       if (split != raid_bio)
-                               generic_make_request(raid_bio);
-                       return split;
-               }
-       } while (split != raid_bio);
+       if (!raid5_read_one_chunk(mddev, raid_bio))
+               return raid_bio;
 
        return NULL;
 }
@@ -5170,19 +5274,27 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
  */
 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 {
-       struct stripe_head *sh = NULL, *tmp;
+       struct stripe_head *sh, *tmp;
        struct list_head *handle_list = NULL;
-       struct r5worker_group *wg = NULL;
+       struct r5worker_group *wg;
+       bool second_try = !r5c_is_writeback(conf->log);
+       bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
 
+again:
+       wg = NULL;
+       sh = NULL;
        if (conf->worker_cnt_per_group == 0) {
-               handle_list = &conf->handle_list;
+               handle_list = try_loprio ? &conf->loprio_list :
+                                       &conf->handle_list;
        } else if (group != ANY_GROUP) {
-               handle_list = &conf->worker_groups[group].handle_list;
+               handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
+                               &conf->worker_groups[group].handle_list;
                wg = &conf->worker_groups[group];
        } else {
                int i;
                for (i = 0; i < conf->group_cnt; i++) {
-                       handle_list = &conf->worker_groups[i].handle_list;
+                       handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
+                               &conf->worker_groups[i].handle_list;
                        wg = &conf->worker_groups[i];
                        if (!list_empty(handle_list))
                                break;
@@ -5233,8 +5345,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
                wg = NULL;
        }
 
-       if (!sh)
-               return NULL;
+       if (!sh) {
+               if (second_try)
+                       return NULL;
+               second_try = true;
+               try_loprio = !try_loprio;
+               goto again;
+       }
 
        if (wg) {
                wg->stripes_cnt--;
@@ -5323,7 +5440,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
        struct r5conf *conf = mddev->private;
        sector_t logical_sector, last_sector;
        struct stripe_head *sh;
-       int remaining;
        int stripe_sectors;
 
        if (mddev->reshape_position != MaxSector)
@@ -5334,7 +5450,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
        last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
 
        bi->bi_next = NULL;
-       bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+       md_write_start(mddev, bi);
 
        stripe_sectors = conf->chunk_sectors *
                (conf->raid_disks - conf->max_degraded);
@@ -5380,7 +5496,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
                                continue;
                        sh->dev[d].towrite = bi;
                        set_bit(R5_OVERWRITE, &sh->dev[d].flags);
-                       raid5_inc_bi_active_stripes(bi);
+                       bio_inc_remaining(bi);
+                       md_write_inc(mddev, bi);
                        sh->overwrite_disks++;
                }
                spin_unlock_irq(&sh->stripe_lock);
@@ -5403,11 +5520,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
                release_stripe_plug(mddev, sh);
        }
 
-       remaining = raid5_dec_bi_active_stripes(bi);
-       if (remaining == 0) {
-               md_write_end(mddev);
-               bio_endio(bi);
-       }
+       md_write_end(mddev);
+       bio_endio(bi);
 }
 
 static void raid5_make_request(struct mddev *mddev, struct bio * bi)
@@ -5418,7 +5532,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
        sector_t logical_sector, last_sector;
        struct stripe_head *sh;
        const int rw = bio_data_dir(bi);
-       int remaining;
        DEFINE_WAIT(w);
        bool do_prepare;
        bool do_flush = false;
@@ -5440,8 +5553,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
                do_flush = bi->bi_opf & REQ_PREFLUSH;
        }
 
-       md_write_start(mddev, bi);
-
        /*
         * If array is degraded, better not do chunk aligned read because
         * later we might have to read it again in order to reconstruct
@@ -5462,7 +5573,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
        logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;
-       bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
+       md_write_start(mddev, bi);
 
        prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
        for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5597,16 +5708,9 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
        }
        finish_wait(&conf->wait_for_overlap, &w);
 
-       remaining = raid5_dec_bi_active_stripes(bi);
-       if (remaining == 0) {
-
-               if ( rw == WRITE )
-                       md_write_end(mddev);
-
-               trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-                                        bi, 0);
-               bio_endio(bi);
-       }
+       if (rw == WRITE)
+               md_write_end(mddev);
+       bio_endio(bi);
 }
 
 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
@@ -5955,7 +6059,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
        return STRIPE_SECTORS;
 }
 
-static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
+static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
+                              unsigned int offset)
 {
        /* We may not be able to submit a whole bio at once as there
         * may not be enough stripe_heads available.
@@ -5971,7 +6076,6 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
        int dd_idx;
        sector_t sector, logical_sector, last_sector;
        int scnt = 0;
-       int remaining;
        int handled = 0;
 
        logical_sector = raid_bio->bi_iter.bi_sector &
@@ -5985,7 +6089,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
                     sector += STRIPE_SECTORS,
                     scnt++) {
 
-               if (scnt < raid5_bi_processed_stripes(raid_bio))
+               if (scnt < offset)
                        /* already done this stripe */
                        continue;
 
@@ -5993,15 +6097,15 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 
                if (!sh) {
                        /* failed to get a stripe - must wait */
-                       raid5_set_bi_processed_stripes(raid_bio, scnt);
                        conf->retry_read_aligned = raid_bio;
+                       conf->retry_read_offset = scnt;
                        return handled;
                }
 
                if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
                        raid5_release_stripe(sh);
-                       raid5_set_bi_processed_stripes(raid_bio, scnt);
                        conf->retry_read_aligned = raid_bio;
+                       conf->retry_read_offset = scnt;
                        return handled;
                }
 
@@ -6010,12 +6114,9 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
                raid5_release_stripe(sh);
                handled++;
        }
-       remaining = raid5_dec_bi_active_stripes(raid_bio);
-       if (remaining == 0) {
-               trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
-                                        raid_bio, 0);
-               bio_endio(raid_bio);
-       }
+
+       bio_endio(raid_bio);
+
        if (atomic_dec_and_test(&conf->active_aligned_reads))
                wake_up(&conf->wait_for_quiescent);
        return handled;
@@ -6058,7 +6159,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
 
        for (i = 0; i < batch_size; i++)
                handle_stripe(batch[i]);
-       r5l_write_stripe_run(conf->log);
+       log_write_stripe_run(conf);
 
        cond_resched();
 
@@ -6075,6 +6176,7 @@ static void raid5_do_work(struct work_struct *work)
        struct r5worker *worker = container_of(work, struct r5worker, work);
        struct r5worker_group *group = worker->group;
        struct r5conf *conf = group->conf;
+       struct mddev *mddev = conf->mddev;
        int group_id = group - conf->worker_groups;
        int handled;
        struct blk_plug plug;
@@ -6095,6 +6197,9 @@ static void raid5_do_work(struct work_struct *work)
                if (!batch_size && !released)
                        break;
                handled += batch_size;
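+               /* don't start on new stripes while a superblock update is pending */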
+               wait_event_lock_irq(mddev->sb_wait,
+                       !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+                       conf->device_lock);
        }
        pr_debug("%d stripes handled\n", handled);
 
@@ -6122,24 +6227,13 @@ static void raid5d(struct md_thread *thread)
 
        md_check_recovery(mddev);
 
-       if (!bio_list_empty(&conf->return_bi) &&
-           !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-               struct bio_list tmp = BIO_EMPTY_LIST;
-               spin_lock_irq(&conf->device_lock);
-               if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-                       bio_list_merge(&tmp, &conf->return_bi);
-                       bio_list_init(&conf->return_bi);
-               }
-               spin_unlock_irq(&conf->device_lock);
-               return_io(&tmp);
-       }
-
        blk_start_plug(&plug);
        handled = 0;
        spin_lock_irq(&conf->device_lock);
        while (1) {
                struct bio *bio;
                int batch_size, released;
+               unsigned int offset;
 
                released = release_stripe_list(conf, conf->temp_inactive_list);
                if (released)
@@ -6157,10 +6251,10 @@ static void raid5d(struct md_thread *thread)
                }
                raid5_activate_delayed(conf);
 
-               while ((bio = remove_bio_from_retry(conf))) {
+               while ((bio = remove_bio_from_retry(conf, &offset))) {
                        int ok;
                        spin_unlock_irq(&conf->device_lock);
-                       ok = retry_aligned_read(conf, bio);
+                       ok = retry_aligned_read(conf, bio, offset);
                        spin_lock_irq(&conf->device_lock);
                        if (!ok)
                                break;
@@ -6544,6 +6638,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
 
                group = &(*worker_groups)[i];
                INIT_LIST_HEAD(&group->handle_list);
+               INIT_LIST_HEAD(&group->loprio_list);
                group->conf = conf;
                group->workers = workers + i * cnt;
 
@@ -6634,8 +6729,8 @@ static void free_conf(struct r5conf *conf)
 {
        int i;
 
-       if (conf->log)
-               r5l_exit_log(conf->log);
+       log_exit(conf);
+
        if (conf->shrinker.nr_deferred)
                unregister_shrinker(&conf->shrinker);
 
@@ -6646,7 +6741,10 @@ static void free_conf(struct r5conf *conf)
                if (conf->disks[i].extra_page)
                        put_page(conf->disks[i].extra_page);
        kfree(conf->disks);
+       if (conf->bio_split)
+               bioset_free(conf->bio_split);
        kfree(conf->stripe_hashtbl);
+       kfree(conf->pending_data);
        kfree(conf);
 }
 
@@ -6756,6 +6854,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
        if (conf == NULL)
                goto abort;
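+       /* fixed pool of r5pending_data entries used to batch deferred write bios */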
+       INIT_LIST_HEAD(&conf->free_list);
+       INIT_LIST_HEAD(&conf->pending_list);
+       conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
+               PENDING_IO_MAX, GFP_KERNEL);
+       if (!conf->pending_data)
+               goto abort;
+       for (i = 0; i < PENDING_IO_MAX; i++)
+               list_add(&conf->pending_data[i].sibling, &conf->free_list);
        /* Don't enable multi-threading by default*/
        if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
                                 &new_group)) {
@@ -6771,15 +6877,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        init_waitqueue_head(&conf->wait_for_stripe);
        init_waitqueue_head(&conf->wait_for_overlap);
        INIT_LIST_HEAD(&conf->handle_list);
+       INIT_LIST_HEAD(&conf->loprio_list);
        INIT_LIST_HEAD(&conf->hold_list);
        INIT_LIST_HEAD(&conf->delayed_list);
        INIT_LIST_HEAD(&conf->bitmap_list);
-       bio_list_init(&conf->return_bi);
        init_llist_head(&conf->released_stripes);
        atomic_set(&conf->active_stripes, 0);
        atomic_set(&conf->preread_active_stripes, 0);
        atomic_set(&conf->active_aligned_reads, 0);
-       bio_list_init(&conf->pending_bios);
        spin_lock_init(&conf->pending_bios_lock);
        conf->batch_bio_dispatch = true;
        rdev_for_each(rdev, mddev) {
@@ -6813,6 +6918,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
                        goto abort;
        }
 
+       conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+       if (!conf->bio_split)
+               goto abort;
        conf->mddev = mddev;
 
        if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -7097,6 +7205,13 @@ static int raid5_run(struct mddev *mddev)
                BUG_ON(mddev->delta_disks != 0);
        }
 
+       if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+           test_bit(MD_HAS_PPL, &mddev->flags)) {
+               pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+                       mdname(mddev));
+               clear_bit(MD_HAS_PPL, &mddev->flags);
+       }
+
        if (mddev->private == NULL)
                conf = setup_conf(mddev);
        else
@@ -7188,7 +7303,10 @@ static int raid5_run(struct mddev *mddev)
 
        if (mddev->degraded > dirty_parity_disks &&
            mddev->recovery_cp != MaxSector) {
-               if (mddev->ok_start_degraded)
+               if (test_bit(MD_HAS_PPL, &mddev->flags))
+                       pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
+                               mdname(mddev));
+               else if (mddev->ok_start_degraded)
                        pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
                                mdname(mddev));
                else {
@@ -7254,14 +7372,6 @@ static int raid5_run(struct mddev *mddev)
                mddev->queue->limits.discard_alignment = stripe;
                mddev->queue->limits.discard_granularity = stripe;
 
-               /*
-                * We use 16-bit counter of active stripes in bi_phys_segments
-                * (minus one for over-loaded initialization)
-                */
-               blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
-               blk_queue_max_discard_sectors(mddev->queue,
-                                             0xfffe * STRIPE_SECTORS);
-
                blk_queue_max_write_same_sectors(mddev->queue, 0);
                blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 
@@ -7299,14 +7409,8 @@ static int raid5_run(struct mddev *mddev)
                blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
        }
 
-       if (journal_dev) {
-               char b[BDEVNAME_SIZE];
-
-               pr_debug("md/raid:%s: using device %s as journal\n",
-                        mdname(mddev), bdevname(journal_dev->bdev, b));
-               if (r5l_init_log(conf, journal_dev))
-                       goto abort;
-       }
+       if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
+               goto abort;
 
        return 0;
 abort:
@@ -7420,17 +7524,16 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 
        print_raid5_conf(conf);
        if (test_bit(Journal, &rdev->flags) && conf->log) {
-               struct r5l_log *log;
                /*
                 * we can't wait pending write here, as this is called in
                 * raid5d, wait will deadlock.
+                * neilb: there is no locking about new writes here,
+                * so this cannot be safe.
                 */
-               if (atomic_read(&mddev->writes_pending))
+               if (atomic_read(&conf->active_stripes)) {
                        return -EBUSY;
-               log = conf->log;
-               conf->log = NULL;
-               synchronize_rcu();
-               r5l_exit_log(log);
+               }
+               log_exit(conf);
                return 0;
        }
        if (rdev == p->rdev)
@@ -7469,6 +7572,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
                        *rdevp = rdev;
                }
        }
+       if (!err) {
+               err = log_modify(conf, rdev, false);
+               if (err)
+                       goto abort;
+       }
        if (p->replacement) {
                /* We must have just cleared 'rdev' */
                p->rdev = p->replacement;
@@ -7477,12 +7585,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
                           * but will never see neither - if they are careful
                           */
                p->replacement = NULL;
-               clear_bit(WantReplacement, &rdev->flags);
-       } else
-               /* We might have just removed the Replacement as faulty-
-                * clear the bit just in case
-                */
-               clear_bit(WantReplacement, &rdev->flags);
+
+               if (!err)
+                       err = log_modify(conf, p->rdev, true);
+       }
+
+       clear_bit(WantReplacement, &rdev->flags);
 abort:
 
        print_raid5_conf(conf);
@@ -7499,7 +7607,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
        int last = conf->raid_disks - 1;
 
        if (test_bit(Journal, &rdev->flags)) {
-               char b[BDEVNAME_SIZE];
                if (conf->log)
                        return -EBUSY;
 
@@ -7508,9 +7615,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
                 * The array is in readonly mode if journal is missing, so no
                 * write requests running. We should be safe
                 */
-               r5l_init_log(conf, rdev);
-               pr_debug("md/raid:%s: using device %s as journal\n",
-                        mdname(mddev), bdevname(rdev->bdev, b));
+               log_init(conf, rdev, false);
                return 0;
        }
        if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7537,10 +7642,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
                if (p->rdev == NULL) {
                        clear_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = disk;
-                       err = 0;
                        if (rdev->saved_raid_disk != disk)
                                conf->fullsync = 1;
                        rcu_assign_pointer(p->rdev, rdev);
+
+                       err = log_modify(conf, rdev, true);
+
                        goto out;
                }
        }
@@ -7574,7 +7681,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
        sector_t newsize;
        struct r5conf *conf = mddev->private;
 
-       if (conf->log)
+       if (conf->log || raid5_has_ppl(conf))
                return -EINVAL;
        sectors &= ~((sector_t)conf->chunk_sectors - 1);
        newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7625,7 +7732,7 @@ static int check_reshape(struct mddev *mddev)
 {
        struct r5conf *conf = mddev->private;
 
-       if (conf->log)
+       if (conf->log || raid5_has_ppl(conf))
                return -EINVAL;
        if (mddev->delta_disks == 0 &&
            mddev->new_layout == mddev->layout &&
@@ -7658,6 +7765,9 @@ static int check_reshape(struct mddev *mddev)
                                      mddev->chunk_sectors)
                            ) < 0)
                        return -ENOMEM;
+
+       if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
+               return 0; /* never bother to shrink */
        return resize_stripes(conf, (conf->previous_raid_disks
                                     + mddev->delta_disks));
 }
@@ -8148,6 +8258,68 @@ static void *raid6_takeover(struct mddev *mddev)
        return setup_conf(mddev);
 }
 
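+/*
+ * Handle a change of the array's consistency policy (written through md's
+ * "consistency_policy" attribute, typically via sysfs): "ppl" enables the
+ * partial parity log on a RAID5 array, "resync" switches back to plain
+ * resync, dropping PPL or an already-failed journal.
+ */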
+static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
+{
+       struct r5conf *conf;
+       int err;
+
+       err = mddev_lock(mddev);
+       if (err)
+               return err;
+       conf = mddev->private;
+       if (!conf) {
+               mddev_unlock(mddev);
+               return -ENODEV;
+       }
+
+       if (strncmp(buf, "ppl", 3) == 0) {
+               /* ppl only works with RAID 5 */
+               if (!raid5_has_ppl(conf) && conf->level == 5) {
+                       err = log_init(conf, NULL, true);
+                       if (!err) {
+                               err = resize_stripes(conf, conf->pool_size);
+                               if (err)
+                                       log_exit(conf);
+                       }
+               } else
+                       err = -EINVAL;
+       } else if (strncmp(buf, "resync", 6) == 0) {
+               if (raid5_has_ppl(conf)) {
+                       mddev_suspend(mddev);
+                       log_exit(conf);
+                       mddev_resume(mddev);
+                       err = resize_stripes(conf, conf->pool_size);
+               } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
+                          r5l_log_disk_error(conf)) {
+                       bool journal_dev_exists = false;
+                       struct md_rdev *rdev;
+
+                       rdev_for_each(rdev, mddev)
+                               if (test_bit(Journal, &rdev->flags)) {
+                                       journal_dev_exists = true;
+                                       break;
+                               }
+
+                       if (!journal_dev_exists) {
+                               mddev_suspend(mddev);
+                               clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+                               mddev_resume(mddev);
+                       } else  /* need to remove the journal device first */
+                               err = -EBUSY;
+               } else
+                       err = -EINVAL;
+       } else {
+               err = -EINVAL;
+       }
+
+       if (!err)
+               md_update_sb(mddev, 1);
+
+       mddev_unlock(mddev);
+
+       return err;
+}
+
 static struct md_personality raid6_personality =
 {
        .name           = "raid6",
@@ -8170,6 +8342,7 @@ static struct md_personality raid6_personality =
        .quiesce        = raid5_quiesce,
        .takeover       = raid6_takeover,
        .congested      = raid5_congested,
+       .change_consistency_policy = raid5_change_consistency_policy,
 };
 static struct md_personality raid5_personality =
 {
@@ -8193,6 +8366,7 @@ static struct md_personality raid5_personality =
        .quiesce        = raid5_quiesce,
        .takeover       = raid5_takeover,
        .congested      = raid5_congested,
+       .change_consistency_policy = raid5_change_consistency_policy,
 };
 
 static struct md_personality raid4_personality =
@@ -8217,6 +8391,7 @@ static struct md_personality raid4_personality =
        .quiesce        = raid5_quiesce,
        .takeover       = raid4_takeover,
        .congested      = raid5_congested,
+       .change_consistency_policy = raid5_change_consistency_policy,
 };
 
 static int __init raid5_init(void)