Merge branch 'for-linus-4.12-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
[sfrench/cifs-2.6.git] / drivers/md/raid5.c
index 2efdb0d6746074a0416f18b0c0dfbab7cc5be6a8..2e38cfac5b1dc5a318f66b4bb6e2e2195797ad53 100644 (file)
 #include <linux/sched/signal.h>
 
 #include <trace/events/block.h>
+#include <linux/list_sort.h>
 
 #include "md.h"
 #include "raid5.h"
 #include "raid0.h"
 #include "bitmap.h"
+#include "raid5-log.h"
 
 #define UNSUPPORTED_MDDEV_FLAGS        (1L << MD_FAILFAST_SUPPORTED)
 
@@ -156,17 +158,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
        return slot;
 }
 
-static void return_io(struct bio_list *return_bi)
-{
-       struct bio *bi;
-       while ((bi = bio_list_pop(return_bi)) != NULL) {
-               bi->bi_iter.bi_size = 0;
-               trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-                                        bi, 0);
-               bio_endio(bi);
-       }
-}
-
 static void print_raid5_conf (struct r5conf *conf);
 
 static int stripe_operations_active(struct stripe_head *sh)
@@ -176,6 +167,13 @@ static int stripe_operations_active(struct stripe_head *sh)
               test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
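+/*
+ * r5c write-back stripes that are being written out to the array (full or
+ * partial cached stripes not in the caching phase) are queued on separate
+ * low-priority lists; __get_priority_stripe() decides which list to serve
+ * first depending on how tight the cache log is.
+ */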
+static bool stripe_is_lowprio(struct stripe_head *sh)
+{
+       return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
+               test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
+              !test_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 {
        struct r5conf *conf = sh->raid_conf;
@@ -191,7 +189,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
        if (list_empty(&sh->lru)) {
                struct r5worker_group *group;
                group = conf->worker_groups + cpu_to_group(cpu);
-               list_add_tail(&sh->lru, &group->handle_list);
+               if (stripe_is_lowprio(sh))
+                       list_add_tail(&sh->lru, &group->loprio_list);
+               else
+                       list_add_tail(&sh->lru, &group->handle_list);
                group->stripes_cnt++;
                sh->group = group;
        }
@@ -254,7 +255,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
                        clear_bit(STRIPE_DELAYED, &sh->state);
                        clear_bit(STRIPE_BIT_DELAY, &sh->state);
                        if (conf->worker_cnt_per_group == 0) {
-                               list_add_tail(&sh->lru, &conf->handle_list);
+                               if (stripe_is_lowprio(sh))
+                                       list_add_tail(&sh->lru,
+                                                       &conf->loprio_list);
+                               else
+                                       list_add_tail(&sh->lru,
+                                                       &conf->handle_list);
                        } else {
                                raid5_wakeup_stripe_thread(sh);
                                return;
@@ -481,6 +487,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
                sh->dev[i].page = page;
                sh->dev[i].orig_page = page;
        }
+
        return 0;
 }
 
@@ -729,7 +736,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
 {
        struct r5conf *conf = sh->raid_conf;
 
-       if (conf->log)
+       if (conf->log || raid5_has_ppl(conf))
                return false;
        return test_bit(STRIPE_BATCH_READY, &sh->state) &&
                !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -863,41 +870,107 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
        return 1;
 }
 
-static void flush_deferred_bios(struct r5conf *conf)
+static void dispatch_bio_list(struct bio_list *tmp)
 {
-       struct bio_list tmp;
        struct bio *bio;
 
-       if (!conf->batch_bio_dispatch || !conf->group_cnt)
+       while ((bio = bio_list_pop(tmp)))
+               generic_make_request(bio);
+}
+
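+/* list_sort() comparator: order pending stripe data by start sector, ascending */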
+static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
+{
+       const struct r5pending_data *da = list_entry(a,
+                               struct r5pending_data, sibling);
+       const struct r5pending_data *db = list_entry(b,
+                               struct r5pending_data, sibling);
+       if (da->sector > db->sector)
+               return 1;
+       if (da->sector < db->sector)
+               return -1;
+       return 0;
+}
+
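+/*
+ * Move the bios of up to @target queued r5pending_data entries into @list:
+ * sort the pending list by sector, start from where the previous flush left
+ * off (next_pending_data) and return the drained entries to the free list.
+ */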
+static void dispatch_defer_bios(struct r5conf *conf, int target,
+                               struct bio_list *list)
+{
+       struct r5pending_data *data;
+       struct list_head *first, *next = NULL;
+       int cnt = 0;
+
+       if (conf->pending_data_cnt == 0)
+               return;
+
+       list_sort(NULL, &conf->pending_list, cmp_stripe);
+
+       first = conf->pending_list.next;
+
+       /* temporarily move the head */
+       if (conf->next_pending_data)
+               list_move_tail(&conf->pending_list,
+                               &conf->next_pending_data->sibling);
+
+       while (!list_empty(&conf->pending_list)) {
+               data = list_first_entry(&conf->pending_list,
+                       struct r5pending_data, sibling);
+               if (&data->sibling == first)
+                       first = data->sibling.next;
+               next = data->sibling.next;
+
+               bio_list_merge(list, &data->bios);
+               list_move(&data->sibling, &conf->free_list);
+               cnt++;
+               if (cnt >= target)
+                       break;
+       }
+       conf->pending_data_cnt -= cnt;
+       BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
+
+       if (next != &conf->pending_list)
+               conf->next_pending_data = list_entry(next,
+                               struct r5pending_data, sibling);
+       else
+               conf->next_pending_data = NULL;
+       /* list isn't empty */
+       if (first != &conf->pending_list)
+               list_move_tail(&conf->pending_list, first);
+}
+
+static void flush_deferred_bios(struct r5conf *conf)
+{
+       struct bio_list tmp = BIO_EMPTY_LIST;
+
+       if (conf->pending_data_cnt == 0)
                return;
 
-       bio_list_init(&tmp);
        spin_lock(&conf->pending_bios_lock);
-       bio_list_merge(&tmp, &conf->pending_bios);
-       bio_list_init(&conf->pending_bios);
+       dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
+       BUG_ON(conf->pending_data_cnt != 0);
        spin_unlock(&conf->pending_bios_lock);
 
-       while ((bio = bio_list_pop(&tmp)))
-               generic_make_request(bio);
+       dispatch_bio_list(&tmp);
 }
 
-static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
+static void defer_issue_bios(struct r5conf *conf, sector_t sector,
+                               struct bio_list *bios)
 {
-       /*
-        * change group_cnt will drain all bios, so this is safe
-        *
-        * A read generally means a read-modify-write, which usually means a
-        * randwrite, so we don't delay it
-        */
-       if (!conf->batch_bio_dispatch || !conf->group_cnt ||
-           bio_op(bio) == REQ_OP_READ) {
-               generic_make_request(bio);
-               return;
-       }
+       struct bio_list tmp = BIO_EMPTY_LIST;
+       struct r5pending_data *ent;
+
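+       /*
+        * Park this stripe's write bios on a free r5pending_data entry; once
+        * PENDING_IO_MAX entries have accumulated, pull out a batch of
+        * PENDING_IO_ONE_FLUSH entries (sorted by sector) and submit it after
+        * the lock is dropped.
+        */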
        spin_lock(&conf->pending_bios_lock);
-       bio_list_add(&conf->pending_bios, bio);
+       ent = list_first_entry(&conf->free_list, struct r5pending_data,
+                                                       sibling);
+       list_move_tail(&ent->sibling, &conf->pending_list);
+       ent->sector = sector;
+       bio_list_init(&ent->bios);
+       bio_list_merge(&ent->bios, bios);
+       conf->pending_data_cnt++;
+       if (conf->pending_data_cnt >= PENDING_IO_MAX)
+               dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
+
        spin_unlock(&conf->pending_bios_lock);
-       md_wakeup_thread(conf->mddev->thread);
+
+       dispatch_bio_list(&tmp);
 }
 
 static void
@@ -910,21 +983,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
        struct r5conf *conf = sh->raid_conf;
        int i, disks = sh->disks;
        struct stripe_head *head_sh = sh;
+       struct bio_list pending_bios = BIO_EMPTY_LIST;
+       bool should_defer;
 
        might_sleep();
 
-       if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
-               /* writing out phase */
-               if (s->waiting_extra_page)
-                       return;
-               if (r5l_write_stripe(conf->log, sh) == 0)
-                       return;
-       } else {  /* caching phase */
-               if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
-                       r5c_cache_data(conf->log, sh, s);
-                       return;
-               }
-       }
+       if (log_stripe(sh, s) == 0)
+               return;
+
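+       /*
+        * With worker groups enabled, write bios are collected here and
+        * submitted later in sector-sorted batches (see defer_issue_bios()).
+        */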
+       should_defer = conf->batch_bio_dispatch && conf->group_cnt;
 
        for (i = disks; i--; ) {
                int op, op_flags = 0;
@@ -1080,7 +1147,10 @@ again:
                                trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
                                                      bi, disk_devt(conf->mddev->gendisk),
                                                      sh->dev[i].sector);
-                       defer_bio_issue(conf, bi);
+                       if (should_defer && op_is_write(op))
+                               bio_list_add(&pending_bios, bi);
+                       else
+                               generic_make_request(bi);
                }
                if (rrdev) {
                        if (s->syncing || s->expanding || s->expanded
@@ -1125,7 +1195,10 @@ again:
                                trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
                                                      rbi, disk_devt(conf->mddev->gendisk),
                                                      sh->dev[i].sector);
-                       defer_bio_issue(conf, rbi);
+                       if (should_defer && op_is_write(op))
+                               bio_list_add(&pending_bios, rbi);
+                       else
+                               generic_make_request(rbi);
                }
                if (!rdev && !rrdev) {
                        if (op_is_write(op))
@@ -1143,6 +1216,9 @@ again:
                if (sh != head_sh)
                        goto again;
        }
+
+       if (should_defer && !bio_list_empty(&pending_bios))
+               defer_issue_bios(conf, head_sh->sector, &pending_bios);
 }
 
 static struct dma_async_tx_descriptor *
@@ -1212,7 +1288,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 static void ops_complete_biofill(void *stripe_head_ref)
 {
        struct stripe_head *sh = stripe_head_ref;
-       struct bio_list return_bi = BIO_EMPTY_LIST;
        int i;
 
        pr_debug("%s: stripe %llu\n", __func__,
@@ -1236,16 +1311,13 @@ static void ops_complete_biofill(void *stripe_head_ref)
                        while (rbi && rbi->bi_iter.bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                rbi2 = r5_next_bio(rbi, dev->sector);
-                               if (!raid5_dec_bi_active_stripes(rbi))
-                                       bio_list_add(&return_bi, rbi);
+                               bio_endio(rbi);
                                rbi = rbi2;
                        }
                }
        }
        clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
-       return_io(&return_bi);
-
        set_bit(STRIPE_HANDLE, &sh->state);
        raid5_release_stripe(sh);
 }
@@ -2014,6 +2086,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
                        tx = ops_run_prexor6(sh, percpu, tx);
        }
 
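+       /* for PPL, compute the partial parity before draining in new data */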
+       if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+               tx = ops_run_partial_parity(sh, percpu, tx);
+
        if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
                tx = ops_run_biodrain(sh, tx);
                overlap_clear++;
@@ -2046,8 +2121,15 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
        put_cpu();
 }
 
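+/* free a stripe_head together with its partial parity (PPL) page, if any */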
+static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
+{
+       if (sh->ppl_page)
+               __free_page(sh->ppl_page);
+       kmem_cache_free(sc, sh);
+}
+
 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
-       int disks)
+       int disks, struct r5conf *conf)
 {
        struct stripe_head *sh;
        int i;
@@ -2061,6 +2143,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
                INIT_LIST_HEAD(&sh->r5c);
                INIT_LIST_HEAD(&sh->log_list);
                atomic_set(&sh->count, 1);
+               sh->raid_conf = conf;
                sh->log_start = MaxSector;
                for (i = 0; i < disks; i++) {
                        struct r5dev *dev = &sh->dev[i];
@@ -2068,6 +2151,14 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
                        bio_init(&dev->req, &dev->vec, 1);
                        bio_init(&dev->rreq, &dev->rvec, 1);
                }
+
+               if (raid5_has_ppl(conf)) {
+                       sh->ppl_page = alloc_page(gfp);
+                       if (!sh->ppl_page) {
+                               free_stripe(sc, sh);
+                               sh = NULL;
+                       }
+               }
        }
        return sh;
 }
@@ -2075,15 +2166,13 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
        struct stripe_head *sh;
 
-       sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
+       sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
        if (!sh)
                return 0;
 
-       sh->raid_conf = conf;
-
        if (grow_buffers(sh, gfp)) {
                shrink_buffers(sh);
-               kmem_cache_free(conf->slab_cache, sh);
+               free_stripe(conf->slab_cache, sh);
                return 0;
        }
        sh->hash_lock_index =
@@ -2210,7 +2299,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
         *    pages have been transferred over, and the old kmem_cache is
         *    freed when all stripes are done.
         * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
-        *    we simple return a failre status - no need to clean anything up.
+        *    we simply return a failure status - no need to clean anything up.
         * 4/ allocate new pages for the new slots in the new stripe_heads.
         *    If this fails, we don't bother trying the shrink the
         *    stripe_heads down again, we just leave them as they are.
@@ -2228,9 +2317,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        int i;
        int hash, cnt;
 
-       if (newsize <= conf->pool_size)
-               return 0; /* never bother to shrink */
-
        err = md_allow_write(conf->mddev);
        if (err)
                return err;
@@ -2246,11 +2332,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        mutex_lock(&conf->cache_size_mutex);
 
        for (i = conf->max_nr_stripes; i; i--) {
-               nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
+               nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
                if (!nsh)
                        break;
 
-               nsh->raid_conf = conf;
                list_add(&nsh->lru, &newstripes);
        }
        if (i) {
@@ -2258,7 +2343,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
                while (!list_empty(&newstripes)) {
                        nsh = list_entry(newstripes.next, struct stripe_head, lru);
                        list_del(&nsh->lru);
-                       kmem_cache_free(sc, nsh);
+                       free_stripe(sc, nsh);
                }
                kmem_cache_destroy(sc);
                mutex_unlock(&conf->cache_size_mutex);
@@ -2284,7 +2369,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
                        nsh->dev[i].orig_page = osh->dev[i].page;
                }
                nsh->hash_lock_index = hash;
-               kmem_cache_free(conf->slab_cache, osh);
+               free_stripe(conf->slab_cache, osh);
                cnt++;
                if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
                    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
@@ -2323,6 +2408,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
                err = -ENOMEM;
 
        mutex_unlock(&conf->cache_size_mutex);
+
+       conf->slab_cache = sc;
+       conf->active_name = 1-conf->active_name;
+
        /* Step 4, return new stripes to service */
        while(!list_empty(&newstripes)) {
                nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2340,8 +2429,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        }
        /* critical section pass, GFP_NOIO no longer needed */
 
-       conf->slab_cache = sc;
-       conf->active_name = 1-conf->active_name;
        if (!err)
                conf->pool_size = newsize;
        return err;
@@ -2359,7 +2446,7 @@ static int drop_one_stripe(struct r5conf *conf)
                return 0;
        BUG_ON(atomic_read(&sh->count));
        shrink_buffers(sh);
-       kmem_cache_free(conf->slab_cache, sh);
+       free_stripe(conf->slab_cache, sh);
        atomic_dec(&conf->active_stripes);
        conf->max_nr_stripes--;
        return 1;
@@ -3082,6 +3169,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                s->locked++;
        }
 
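+       /* with PPL, schedule the partial parity op unless the whole stripe is overwritten */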
+       if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
+           test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+           !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+           test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+               set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
        pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
                __func__, (unsigned long long)sh->sector,
                s->locked, s->ops_request);
@@ -3103,14 +3196,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
                (unsigned long long)bi->bi_iter.bi_sector,
                (unsigned long long)sh->sector);
 
-       /*
-        * If several bio share a stripe. The bio bi_phys_segments acts as a
-        * reference count to avoid race. The reference count should already be
-        * increased before this function is called (for example, in
-        * raid5_make_request()), so other bio sharing this stripe will not free the
-        * stripe. If a stripe is owned by one stripe, the stripe lock will
-        * protect it.
-        */
        spin_lock_irq(&sh->stripe_lock);
        /* Don't allow new IO added to stripes in batch list */
        if (sh->batch_head)
@@ -3129,6 +3214,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
        if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
                goto overlap;
 
+       if (forwrite && raid5_has_ppl(conf)) {
+               /*
+                * With PPL only writes to consecutive data chunks within a
+                * stripe are allowed because for a single stripe_head we can
+                * only have one PPL entry at a time, which describes one data
+                * range. Not really an overlap, but wait_for_overlap can be
+                * used to handle this.
+                */
+               sector_t sector;
+               sector_t first = 0;
+               sector_t last = 0;
+               int count = 0;
+               int i;
+
+               for (i = 0; i < sh->disks; i++) {
+                       if (i != sh->pd_idx &&
+                           (i == dd_idx || sh->dev[i].towrite)) {
+                               sector = sh->dev[i].sector;
+                               if (count == 0 || sector < first)
+                                       first = sector;
+                               if (sector > last)
+                                       last = sector;
+                               count++;
+                       }
+               }
+
+               if (first + conf->chunk_sectors * (count - 1) != last)
+                       goto overlap;
+       }
+
        if (!forwrite || previous)
                clear_bit(STRIPE_BATCH_READY, &sh->state);
 
@@ -3136,7 +3251,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
        if (*bip)
                bi->bi_next = *bip;
        *bip = bi;
-       raid5_inc_bi_active_stripes(bi);
+       bio_inc_remaining(bi);
+       md_write_inc(conf->mddev, bi);
 
        if (forwrite) {
                /* check if page is covered */
@@ -3213,8 +3329,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 
 static void
 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
-                               struct stripe_head_state *s, int disks,
-                               struct bio_list *return_bi)
+                    struct stripe_head_state *s, int disks)
 {
        int i;
        BUG_ON(sh->batch_head);
@@ -3250,7 +3365,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                if (bi)
                        bitmap_end = 1;
 
-               r5l_stripe_write_finished(sh);
+               log_stripe_write_finished(sh);
 
                if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                        wake_up(&conf->wait_for_overlap);
@@ -3260,10 +3375,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                        struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 
                        bi->bi_error = -EIO;
-                       if (!raid5_dec_bi_active_stripes(bi)) {
-                               md_write_end(conf->mddev);
-                               bio_list_add(return_bi, bi);
-                       }
+                       md_write_end(conf->mddev);
+                       bio_endio(bi);
                        bi = nextbi;
                }
                if (bitmap_end)
@@ -3284,10 +3397,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                        struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 
                        bi->bi_error = -EIO;
-                       if (!raid5_dec_bi_active_stripes(bi)) {
-                               md_write_end(conf->mddev);
-                               bio_list_add(return_bi, bi);
-                       }
+                       md_write_end(conf->mddev);
+                       bio_endio(bi);
                        bi = bi2;
                }
 
@@ -3312,8 +3423,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                                        r5_next_bio(bi, sh->dev[i].sector);
 
                                bi->bi_error = -EIO;
-                               if (!raid5_dec_bi_active_stripes(bi))
-                                       bio_list_add(return_bi, bi);
+                               bio_endio(bi);
                                bi = nextbi;
                        }
                }
@@ -3449,7 +3559,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
            !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                /* Pre-reads at not permitted until after short delay
                 * to gather multiple requests.  However if this
-                * device is no Insync, the block could only be be computed
+                * device is not Insync, the block could only be computed
                 * and there is no need to delay that.
                 */
                return 0;
@@ -3468,7 +3578,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 
        /* If we are forced to do a reconstruct-write, either because
         * the current RAID6 implementation only supports that, or
-        * or because parity cannot be trusted and we are currently
+        * because parity cannot be trusted and we are currently
         * recovering it, there is extra need to be careful.
         * If one of the devices that we would need to read, because
         * it is not being overwritten (and maybe not written at all)
@@ -3508,9 +3618,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
                BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
                BUG_ON(test_bit(R5_Wantread, &dev->flags));
                BUG_ON(sh->batch_head);
+
+               /*
+                * In the raid6 case if the only non-uptodate disk is P
+                * then we already trusted P to compute the other failed
+                * drives. It is safe to compute rather than re-read P.
+                * In other cases we only compute blocks from failed
+                * devices, otherwise check/repair might fail to detect
+                * a real inconsistency.
+                */
+
                if ((s->uptodate == disks - 1) &&
+                   ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
                    (s->failed && (disk_idx == s->failed_num[0] ||
-                                  disk_idx == s->failed_num[1]))) {
+                                  disk_idx == s->failed_num[1])))) {
                        /* have disk failed, and we're requested to fetch it;
                         * do compute it
                         */
@@ -3612,7 +3733,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
  * never LOCKED, so we don't need to test 'failed' directly.
  */
 static void handle_stripe_clean_event(struct r5conf *conf,
-       struct stripe_head *sh, int disks, struct bio_list *return_bi)
+       struct stripe_head *sh, int disks)
 {
        int i;
        struct r5dev *dev;
@@ -3644,10 +3765,8 @@ returnbi:
                                while (wbi && wbi->bi_iter.bi_sector <
                                        dev->sector + STRIPE_SECTORS) {
                                        wbi2 = r5_next_bio(wbi, dev->sector);
-                                       if (!raid5_dec_bi_active_stripes(wbi)) {
-                                               md_write_end(conf->mddev);
-                                               bio_list_add(return_bi, wbi);
-                                       }
+                                       md_write_end(conf->mddev);
+                                       bio_endio(wbi);
                                        wbi = wbi2;
                                }
                                bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -3669,7 +3788,7 @@ returnbi:
                                discard_pending = 1;
                }
 
-       r5l_stripe_write_finished(sh);
+       log_stripe_write_finished(sh);
 
        if (!discard_pending &&
            test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -4556,7 +4675,8 @@ static void handle_stripe(struct stripe_head *sh)
        if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
                goto finish;
 
-       if (s.handle_bad_blocks) {
+       if (s.handle_bad_blocks ||
+           test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
                set_bit(STRIPE_HANDLE, &sh->state);
                goto finish;
        }
@@ -4589,7 +4709,7 @@ static void handle_stripe(struct stripe_head *sh)
                sh->reconstruct_state = 0;
                break_stripe_batch_list(sh, 0);
                if (s.to_read+s.to_write+s.written)
-                       handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+                       handle_failed_stripe(conf, sh, &s, disks);
                if (s.syncing + s.replacing)
                        handle_failed_sync(conf, sh, &s);
        }
@@ -4655,11 +4775,11 @@ static void handle_stripe(struct stripe_head *sh)
                             && !test_bit(R5_LOCKED, &qdev->flags)
                             && (test_bit(R5_UPTODATE, &qdev->flags) ||
                                 test_bit(R5_Discard, &qdev->flags))))))
-               handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+               handle_stripe_clean_event(conf, sh, disks);
 
        if (s.just_cached)
-               r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
-       r5l_stripe_write_finished(sh);
+               r5c_handle_cached_data_endio(conf, sh, disks);
+       log_stripe_write_finished(sh);
 
        /* Now we might consider reading some blocks, either to check/generate
         * parity, or to satisfy requests
@@ -4886,16 +5006,6 @@ finish:
                        md_wakeup_thread(conf->mddev->thread);
        }
 
-       if (!bio_list_empty(&s.return_bi)) {
-               if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
-                       spin_lock_irq(&conf->device_lock);
-                       bio_list_merge(&conf->return_bi, &s.return_bi);
-                       spin_unlock_irq(&conf->device_lock);
-                       md_wakeup_thread(conf->mddev->thread);
-               } else
-                       return_io(&s.return_bi);
-       }
-
        clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
 
@@ -4984,12 +5094,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
        md_wakeup_thread(conf->mddev->thread);
 }
 
-static struct bio *remove_bio_from_retry(struct r5conf *conf)
+static struct bio *remove_bio_from_retry(struct r5conf *conf,
+                                        unsigned int *offset)
 {
        struct bio *bi;
 
        bi = conf->retry_read_aligned;
        if (bi) {
+               *offset = conf->retry_read_offset;
                conf->retry_read_aligned = NULL;
                return bi;
        }
@@ -4997,11 +5109,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
        if(bi) {
                conf->retry_read_aligned_list = bi->bi_next;
                bi->bi_next = NULL;
-               /*
-                * this sets the active strip count to 1 and the processed
-                * strip count to zero (upper 8 bits)
-                */
-               raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
+               *offset = 0;
        }
 
        return bi;
@@ -5136,24 +5244,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
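+/*
+ * Split a read at the chunk boundary: the tail is pushed back through
+ * generic_make_request() and a direct, chunk-aligned read is attempted on
+ * the leading part; if that is not possible the leading bio is returned
+ * for normal stripe handling.
+ */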
 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
 {
        struct bio *split;
+       sector_t sector = raid_bio->bi_iter.bi_sector;
+       unsigned chunk_sects = mddev->chunk_sectors;
+       unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
 
-       do {
-               sector_t sector = raid_bio->bi_iter.bi_sector;
-               unsigned chunk_sects = mddev->chunk_sectors;
-               unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
-
-               if (sectors < bio_sectors(raid_bio)) {
-                       split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
-                       bio_chain(split, raid_bio);
-               } else
-                       split = raid_bio;
+       if (sectors < bio_sectors(raid_bio)) {
+               struct r5conf *conf = mddev->private;
+               split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+               bio_chain(split, raid_bio);
+               generic_make_request(raid_bio);
+               raid_bio = split;
+       }
 
-               if (!raid5_read_one_chunk(mddev, split)) {
-                       if (split != raid_bio)
-                               generic_make_request(raid_bio);
-                       return split;
-               }
-       } while (split != raid_bio);
+       if (!raid5_read_one_chunk(mddev, raid_bio))
+               return raid_bio;
 
        return NULL;
 }
@@ -5170,19 +5274,27 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
  */
 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 {
-       struct stripe_head *sh = NULL, *tmp;
+       struct stripe_head *sh, *tmp;
        struct list_head *handle_list = NULL;
-       struct r5worker_group *wg = NULL;
+       struct r5worker_group *wg;
+       bool second_try = !r5c_is_writeback(conf->log);
+       bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
 
+again:
+       wg = NULL;
+       sh = NULL;
        if (conf->worker_cnt_per_group == 0) {
-               handle_list = &conf->handle_list;
+               handle_list = try_loprio ? &conf->loprio_list :
+                                       &conf->handle_list;
        } else if (group != ANY_GROUP) {
-               handle_list = &conf->worker_groups[group].handle_list;
+               handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
+                               &conf->worker_groups[group].handle_list;
                wg = &conf->worker_groups[group];
        } else {
                int i;
                for (i = 0; i < conf->group_cnt; i++) {
-                       handle_list = &conf->worker_groups[i].handle_list;
+                       handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
+                               &conf->worker_groups[i].handle_list;
                        wg = &conf->worker_groups[i];
                        if (!list_empty(handle_list))
                                break;
@@ -5233,8 +5345,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
                wg = NULL;
        }
 
-       if (!sh)
-               return NULL;
+       if (!sh) {
+               if (second_try)
+                       return NULL;
+               second_try = true;
+               try_loprio = !try_loprio;
+               goto again;
+       }
 
        if (wg) {
                wg->stripes_cnt--;
@@ -5323,7 +5440,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
        struct r5conf *conf = mddev->private;
        sector_t logical_sector, last_sector;
        struct stripe_head *sh;
-       int remaining;
        int stripe_sectors;
 
        if (mddev->reshape_position != MaxSector)
@@ -5334,7 +5450,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
        last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
 
        bi->bi_next = NULL;
-       bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+       md_write_start(mddev, bi);
 
        stripe_sectors = conf->chunk_sectors *
                (conf->raid_disks - conf->max_degraded);
@@ -5380,7 +5496,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
                                continue;
                        sh->dev[d].towrite = bi;
                        set_bit(R5_OVERWRITE, &sh->dev[d].flags);
-                       raid5_inc_bi_active_stripes(bi);
+                       bio_inc_remaining(bi);
+                       md_write_inc(mddev, bi);
                        sh->overwrite_disks++;
                }
                spin_unlock_irq(&sh->stripe_lock);
@@ -5403,11 +5520,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
                release_stripe_plug(mddev, sh);
        }
 
-       remaining = raid5_dec_bi_active_stripes(bi);
-       if (remaining == 0) {
-               md_write_end(mddev);
-               bio_endio(bi);
-       }
+       md_write_end(mddev);
+       bio_endio(bi);
 }
 
 static void raid5_make_request(struct mddev *mddev, struct bio * bi)
@@ -5418,7 +5532,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
        sector_t logical_sector, last_sector;
        struct stripe_head *sh;
        const int rw = bio_data_dir(bi);
-       int remaining;
        DEFINE_WAIT(w);
        bool do_prepare;
        bool do_flush = false;
@@ -5440,8 +5553,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
                do_flush = bi->bi_opf & REQ_PREFLUSH;
        }
 
-       md_write_start(mddev, bi);
-
        /*
         * If array is degraded, better not do chunk aligned read because
         * later we might have to read it again in order to reconstruct
@@ -5462,7 +5573,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
        logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;
-       bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
+       md_write_start(mddev, bi);
 
        prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
        for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5597,16 +5708,9 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
        }
        finish_wait(&conf->wait_for_overlap, &w);
 
-       remaining = raid5_dec_bi_active_stripes(bi);
-       if (remaining == 0) {
-
-               if ( rw == WRITE )
-                       md_write_end(mddev);
-
-               trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-                                        bi, 0);
-               bio_endio(bi);
-       }
+       if (rw == WRITE)
+               md_write_end(mddev);
+       bio_endio(bi);
 }
 
 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
@@ -5955,7 +6059,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
        return STRIPE_SECTORS;
 }
 
-static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
+static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
+                              unsigned int offset)
 {
        /* We may not be able to submit a whole bio at once as there
         * may not be enough stripe_heads available.
@@ -5971,7 +6076,6 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
        int dd_idx;
        sector_t sector, logical_sector, last_sector;
        int scnt = 0;
-       int remaining;
        int handled = 0;
 
        logical_sector = raid_bio->bi_iter.bi_sector &
@@ -5985,7 +6089,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
                     sector += STRIPE_SECTORS,
                     scnt++) {
 
-               if (scnt < raid5_bi_processed_stripes(raid_bio))
+               if (scnt < offset)
                        /* already done this stripe */
                        continue;
 
@@ -5993,15 +6097,15 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 
                if (!sh) {
                        /* failed to get a stripe - must wait */
-                       raid5_set_bi_processed_stripes(raid_bio, scnt);
                        conf->retry_read_aligned = raid_bio;
+                       conf->retry_read_offset = scnt;
                        return handled;
                }
 
                if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
                        raid5_release_stripe(sh);
-                       raid5_set_bi_processed_stripes(raid_bio, scnt);
                        conf->retry_read_aligned = raid_bio;
+                       conf->retry_read_offset = scnt;
                        return handled;
                }
 
@@ -6010,12 +6114,9 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
                raid5_release_stripe(sh);
                handled++;
        }
-       remaining = raid5_dec_bi_active_stripes(raid_bio);
-       if (remaining == 0) {
-               trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
-                                        raid_bio, 0);
-               bio_endio(raid_bio);
-       }
+
+       bio_endio(raid_bio);
+
        if (atomic_dec_and_test(&conf->active_aligned_reads))
                wake_up(&conf->wait_for_quiescent);
        return handled;
@@ -6058,7 +6159,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
 
        for (i = 0; i < batch_size; i++)
                handle_stripe(batch[i]);
-       r5l_write_stripe_run(conf->log);
+       log_write_stripe_run(conf);
 
        cond_resched();
 
@@ -6075,6 +6176,7 @@ static void raid5_do_work(struct work_struct *work)
        struct r5worker *worker = container_of(work, struct r5worker, work);
        struct r5worker_group *group = worker->group;
        struct r5conf *conf = group->conf;
+       struct mddev *mddev = conf->mddev;
        int group_id = group - conf->worker_groups;
        int handled;
        struct blk_plug plug;
@@ -6095,6 +6197,9 @@ static void raid5_do_work(struct work_struct *work)
                if (!batch_size && !released)
                        break;
                handled += batch_size;
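+               /* don't start on new stripes while a superblock update is pending */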
+               wait_event_lock_irq(mddev->sb_wait,
+                       !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+                       conf->device_lock);
        }
        pr_debug("%d stripes handled\n", handled);
 
@@ -6122,24 +6227,13 @@ static void raid5d(struct md_thread *thread)
 
        md_check_recovery(mddev);
 
-       if (!bio_list_empty(&conf->return_bi) &&
-           !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-               struct bio_list tmp = BIO_EMPTY_LIST;
-               spin_lock_irq(&conf->device_lock);
-               if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-                       bio_list_merge(&tmp, &conf->return_bi);
-                       bio_list_init(&conf->return_bi);
-               }
-               spin_unlock_irq(&conf->device_lock);
-               return_io(&tmp);
-       }
-
        blk_start_plug(&plug);
        handled = 0;
        spin_lock_irq(&conf->device_lock);
        while (1) {
                struct bio *bio;
                int batch_size, released;
+               unsigned int offset;
 
                released = release_stripe_list(conf, conf->temp_inactive_list);
                if (released)
@@ -6157,10 +6251,10 @@ static void raid5d(struct md_thread *thread)
                }
                raid5_activate_delayed(conf);
 
-               while ((bio = remove_bio_from_retry(conf))) {
+               while ((bio = remove_bio_from_retry(conf, &offset))) {
                        int ok;
                        spin_unlock_irq(&conf->device_lock);
-                       ok = retry_aligned_read(conf, bio);
+                       ok = retry_aligned_read(conf, bio, offset);
                        spin_lock_irq(&conf->device_lock);
                        if (!ok)
                                break;
@@ -6544,6 +6638,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
 
                group = &(*worker_groups)[i];
                INIT_LIST_HEAD(&group->handle_list);
+               INIT_LIST_HEAD(&group->loprio_list);
                group->conf = conf;
                group->workers = workers + i * cnt;
 
@@ -6634,8 +6729,8 @@ static void free_conf(struct r5conf *conf)
 {
        int i;
 
-       if (conf->log)
-               r5l_exit_log(conf->log);
+       log_exit(conf);
+
        if (conf->shrinker.nr_deferred)
                unregister_shrinker(&conf->shrinker);
 
@@ -6646,7 +6741,10 @@ static void free_conf(struct r5conf *conf)
                if (conf->disks[i].extra_page)
                        put_page(conf->disks[i].extra_page);
        kfree(conf->disks);
+       if (conf->bio_split)
+               bioset_free(conf->bio_split);
        kfree(conf->stripe_hashtbl);
+       kfree(conf->pending_data);
        kfree(conf);
 }
 
@@ -6756,6 +6854,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
        if (conf == NULL)
                goto abort;
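+       /* fixed pool of r5pending_data entries used to batch deferred write bios */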
+       INIT_LIST_HEAD(&conf->free_list);
+       INIT_LIST_HEAD(&conf->pending_list);
+       conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
+               PENDING_IO_MAX, GFP_KERNEL);
+       if (!conf->pending_data)
+               goto abort;
+       for (i = 0; i < PENDING_IO_MAX; i++)
+               list_add(&conf->pending_data[i].sibling, &conf->free_list);
        /* Don't enable multi-threading by default*/
        if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
                                 &new_group)) {
@@ -6771,15 +6877,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        init_waitqueue_head(&conf->wait_for_stripe);
        init_waitqueue_head(&conf->wait_for_overlap);
        INIT_LIST_HEAD(&conf->handle_list);
+       INIT_LIST_HEAD(&conf->loprio_list);
        INIT_LIST_HEAD(&conf->hold_list);
        INIT_LIST_HEAD(&conf->delayed_list);
        INIT_LIST_HEAD(&conf->bitmap_list);
-       bio_list_init(&conf->return_bi);
        init_llist_head(&conf->released_stripes);
        atomic_set(&conf->active_stripes, 0);
        atomic_set(&conf->preread_active_stripes, 0);
        atomic_set(&conf->active_aligned_reads, 0);
-       bio_list_init(&conf->pending_bios);
        spin_lock_init(&conf->pending_bios_lock);
        conf->batch_bio_dispatch = true;
        rdev_for_each(rdev, mddev) {
@@ -6813,6 +6918,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
                        goto abort;
        }
 
+       conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+       if (!conf->bio_split)
+               goto abort;
        conf->mddev = mddev;
 
        if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -7097,6 +7205,13 @@ static int raid5_run(struct mddev *mddev)
                BUG_ON(mddev->delta_disks != 0);
        }
 
+       if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+           test_bit(MD_HAS_PPL, &mddev->flags)) {
+               pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+                       mdname(mddev));
+               clear_bit(MD_HAS_PPL, &mddev->flags);
+       }
+
        if (mddev->private == NULL)
                conf = setup_conf(mddev);
        else
@@ -7188,7 +7303,10 @@ static int raid5_run(struct mddev *mddev)
 
        if (mddev->degraded > dirty_parity_disks &&
            mddev->recovery_cp != MaxSector) {
-               if (mddev->ok_start_degraded)
+               if (test_bit(MD_HAS_PPL, &mddev->flags))
+                       pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
+                               mdname(mddev));
+               else if (mddev->ok_start_degraded)
                        pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
                                mdname(mddev));
                else {
@@ -7254,14 +7372,6 @@ static int raid5_run(struct mddev *mddev)
                mddev->queue->limits.discard_alignment = stripe;
                mddev->queue->limits.discard_granularity = stripe;
 
-               /*
-                * We use 16-bit counter of active stripes in bi_phys_segments
-                * (minus one for over-loaded initialization)
-                */
-               blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
-               blk_queue_max_discard_sectors(mddev->queue,
-                                             0xfffe * STRIPE_SECTORS);
-
                blk_queue_max_write_same_sectors(mddev->queue, 0);
                blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 
@@ -7299,14 +7409,8 @@ static int raid5_run(struct mddev *mddev)
                blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
        }
 
-       if (journal_dev) {
-               char b[BDEVNAME_SIZE];
-
-               pr_debug("md/raid:%s: using device %s as journal\n",
-                        mdname(mddev), bdevname(journal_dev->bdev, b));
-               if (r5l_init_log(conf, journal_dev))
-                       goto abort;
-       }
+       if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
+               goto abort;
 
        return 0;
 abort:
@@ -7420,17 +7524,16 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 
        print_raid5_conf(conf);
        if (test_bit(Journal, &rdev->flags) && conf->log) {
-               struct r5l_log *log;
                /*
                 * we can't wait pending write here, as this is called in
                 * raid5d, wait will deadlock.
+                * neilb: there is no locking about new writes here,
+                * so this cannot be safe.
                 */
-               if (atomic_read(&mddev->writes_pending))
+               if (atomic_read(&conf->active_stripes)) {
                        return -EBUSY;
-               log = conf->log;
-               conf->log = NULL;
-               synchronize_rcu();
-               r5l_exit_log(log);
+               }
+               log_exit(conf);
                return 0;
        }
        if (rdev == p->rdev)
@@ -7469,6 +7572,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
                        *rdevp = rdev;
                }
        }
+       if (!err) {
+               err = log_modify(conf, rdev, false);
+               if (err)
+                       goto abort;
+       }
        if (p->replacement) {
                /* We must have just cleared 'rdev' */
                p->rdev = p->replacement;
@@ -7477,12 +7585,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
                           * but will never see neither - if they are careful
                           */
                p->replacement = NULL;
-               clear_bit(WantReplacement, &rdev->flags);
-       } else
-               /* We might have just removed the Replacement as faulty-
-                * clear the bit just in case
-                */
-               clear_bit(WantReplacement, &rdev->flags);
+
+               if (!err)
+                       err = log_modify(conf, p->rdev, true);
+       }
+
+       clear_bit(WantReplacement, &rdev->flags);
 abort:
 
        print_raid5_conf(conf);
@@ -7499,7 +7607,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
        int last = conf->raid_disks - 1;
 
        if (test_bit(Journal, &rdev->flags)) {
-               char b[BDEVNAME_SIZE];
                if (conf->log)
                        return -EBUSY;
 
@@ -7508,9 +7615,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
                 * The array is in readonly mode if journal is missing, so no
                 * write requests running. We should be safe
                 */
-               r5l_init_log(conf, rdev);
-               pr_debug("md/raid:%s: using device %s as journal\n",
-                        mdname(mddev), bdevname(rdev->bdev, b));
+               log_init(conf, rdev, false);
                return 0;
        }
        if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7537,10 +7642,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
                if (p->rdev == NULL) {
                        clear_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = disk;
-                       err = 0;
                        if (rdev->saved_raid_disk != disk)
                                conf->fullsync = 1;
                        rcu_assign_pointer(p->rdev, rdev);
+
+                       err = log_modify(conf, rdev, true);
+
                        goto out;
                }
        }
@@ -7574,7 +7681,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
        sector_t newsize;
        struct r5conf *conf = mddev->private;
 
-       if (conf->log)
+       if (conf->log || raid5_has_ppl(conf))
                return -EINVAL;
        sectors &= ~((sector_t)conf->chunk_sectors - 1);
        newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7625,7 +7732,7 @@ static int check_reshape(struct mddev *mddev)
 {
        struct r5conf *conf = mddev->private;
 
-       if (conf->log)
+       if (conf->log || raid5_has_ppl(conf))
                return -EINVAL;
        if (mddev->delta_disks == 0 &&
            mddev->new_layout == mddev->layout &&
@@ -7658,6 +7765,9 @@ static int check_reshape(struct mddev *mddev)
                                      mddev->chunk_sectors)
                            ) < 0)
                        return -ENOMEM;
+
+       if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
+               return 0; /* never bother to shrink */
        return resize_stripes(conf, (conf->previous_raid_disks
                                     + mddev->delta_disks));
 }
@@ -8148,6 +8258,68 @@ static void *raid6_takeover(struct mddev *mddev)
        return setup_conf(mddev);
 }
 
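+/*
+ * Handle a change of the array's consistency policy (written through md's
+ * "consistency_policy" attribute, typically via sysfs): "ppl" enables the
+ * partial parity log on a RAID5 array, "resync" switches back to plain
+ * resync, dropping PPL or an already-failed journal.
+ */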
+static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
+{
+       struct r5conf *conf;
+       int err;
+
+       err = mddev_lock(mddev);
+       if (err)
+               return err;
+       conf = mddev->private;
+       if (!conf) {
+               mddev_unlock(mddev);
+               return -ENODEV;
+       }
+
+       if (strncmp(buf, "ppl", 3) == 0) {
+               /* ppl only works with RAID 5 */
+               if (!raid5_has_ppl(conf) && conf->level == 5) {
+                       err = log_init(conf, NULL, true);
+                       if (!err) {
+                               err = resize_stripes(conf, conf->pool_size);
+                               if (err)
+                                       log_exit(conf);
+                       }
+               } else
+                       err = -EINVAL;
+       } else if (strncmp(buf, "resync", 6) == 0) {
+               if (raid5_has_ppl(conf)) {
+                       mddev_suspend(mddev);
+                       log_exit(conf);
+                       mddev_resume(mddev);
+                       err = resize_stripes(conf, conf->pool_size);
+               } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
+                          r5l_log_disk_error(conf)) {
+                       bool journal_dev_exists = false;
+                       struct md_rdev *rdev;
+
+                       rdev_for_each(rdev, mddev)
+                               if (test_bit(Journal, &rdev->flags)) {
+                                       journal_dev_exists = true;
+                                       break;
+                               }
+
+                       if (!journal_dev_exists) {
+                               mddev_suspend(mddev);
+                               clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+                               mddev_resume(mddev);
+                       } else  /* need to remove the journal device first */
+                               err = -EBUSY;
+               } else
+                       err = -EINVAL;
+       } else {
+               err = -EINVAL;
+       }
+
+       if (!err)
+               md_update_sb(mddev, 1);
+
+       mddev_unlock(mddev);
+
+       return err;
+}
+
 static struct md_personality raid6_personality =
 {
        .name           = "raid6",
@@ -8170,6 +8342,7 @@ static struct md_personality raid6_personality =
        .quiesce        = raid5_quiesce,
        .takeover       = raid6_takeover,
        .congested      = raid5_congested,
+       .change_consistency_policy = raid5_change_consistency_policy,
 };
 static struct md_personality raid5_personality =
 {
@@ -8193,6 +8366,7 @@ static struct md_personality raid5_personality =
        .quiesce        = raid5_quiesce,
        .takeover       = raid5_takeover,
        .congested      = raid5_congested,
+       .change_consistency_policy = raid5_change_consistency_policy,
 };
 
 static struct md_personality raid4_personality =
@@ -8217,6 +8391,7 @@ static struct md_personality raid4_personality =
        .quiesce        = raid5_quiesce,
        .takeover       = raid4_takeover,
        .congested      = raid5_congested,
+       .change_consistency_policy = raid5_change_consistency_policy,
 };
 
 static int __init raid5_init(void)