Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Mar 2018 18:08:47 +0000 (10:08 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Mar 2018 18:08:47 +0000 (10:08 -0800)
Pull MD bugfixes from Shaohua Li:

 - fix raid5-ppl flush request handling hang from Artur

 - fix a potential deadlock in raid5/10 reshape from BingJing

 - fix a deadlock for dm-raid from Heinz

 - fix two md-cluster of raid10 from Lidong and Guoqing

 - fix a NULL deference problem in device removal from Neil

 - fix a NULL deference problem in raid1/raid10 in specific condition
   from Yufen

 - other cleanup and fixes

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  md/raid1: fix NULL pointer dereference
  md: fix a potential deadlock of raid5/raid10 reshape
  md-cluster: choose correct label when clustered layout is not supported
  md: raid5: avoid string overflow warning
  raid5-ppl: fix handling flush requests
  md raid10: fix NULL deference in handle_write_completed()
  md: only allow remove_and_add_spares when no sync_thread running.
  md: document lifetime of internal rdev pointer.
  md: fix md_write_start() deadlock w/o metadata devices
  MD: Free bioset when md_run fails
  raid10: change the size of resync window for clustered raid
  md-multipath: Use seq_putc() in multipath_status()
  md/raid1: Fix trailing semicolon
  md/raid5: simplify uninitialization of shrinker

drivers/md/md-multipath.c
drivers/md/md.c
drivers/md/md.h
drivers/md/raid1.c
drivers/md/raid1.h
drivers/md/raid10.c
drivers/md/raid10.h
drivers/md/raid5-log.h
drivers/md/raid5-ppl.c
drivers/md/raid5.c
drivers/md/raid5.h

index e40065bdbfc84ef8455dbcc88d6a3208a6780571..0a7e99d62c69048d7000d3dc6bbcaa3320c742b5 100644 (file)
@@ -157,7 +157,7 @@ static void multipath_status(struct seq_file *seq, struct mddev *mddev)
                seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
        }
        rcu_read_unlock();
-       seq_printf (seq, "]");
+       seq_putc(seq, ']');
 }
 
 static int multipath_congested(struct mddev *mddev, int bits)
index bc67ab6844f02d540cf4ae3725ebdad13ee568b8..254e44e44668f5fff8cc2d95bdb3b682450a204f 100644 (file)
@@ -801,6 +801,9 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
        struct bio *bio;
        int ff = 0;
 
+       if (!page)
+               return;
+
        if (test_bit(Faulty, &rdev->flags))
                return;
 
@@ -5452,6 +5455,7 @@ int md_run(struct mddev *mddev)
         * the only valid external interface is through the md
         * device.
         */
+       mddev->has_superblocks = false;
        rdev_for_each(rdev, mddev) {
                if (test_bit(Faulty, &rdev->flags))
                        continue;
@@ -5465,6 +5469,9 @@ int md_run(struct mddev *mddev)
                                set_disk_ro(mddev->gendisk, 1);
                }
 
+               if (rdev->sb_page)
+                       mddev->has_superblocks = true;
+
                /* perform some consistency tests on the device.
                 * We don't want the data to overlap the metadata,
                 * Internal Bitmap issues have been handled elsewhere.
@@ -5497,8 +5504,10 @@ int md_run(struct mddev *mddev)
        }
        if (mddev->sync_set == NULL) {
                mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-               if (!mddev->sync_set)
-                       return -ENOMEM;
+               if (!mddev->sync_set) {
+                       err = -ENOMEM;
+                       goto abort;
+               }
        }
 
        spin_lock(&pers_lock);
@@ -5511,7 +5520,8 @@ int md_run(struct mddev *mddev)
                else
                        pr_warn("md: personality for level %s is not loaded!\n",
                                mddev->clevel);
-               return -EINVAL;
+               err = -EINVAL;
+               goto abort;
        }
        spin_unlock(&pers_lock);
        if (mddev->level != pers->level) {
@@ -5524,7 +5534,8 @@ int md_run(struct mddev *mddev)
            pers->start_reshape == NULL) {
                /* This personality cannot handle reshaping... */
                module_put(pers->owner);
-               return -EINVAL;
+               err = -EINVAL;
+               goto abort;
        }
 
        if (pers->sync_request) {
@@ -5593,7 +5604,7 @@ int md_run(struct mddev *mddev)
                mddev->private = NULL;
                module_put(pers->owner);
                bitmap_destroy(mddev);
-               return err;
+               goto abort;
        }
        if (mddev->queue) {
                bool nonrot = true;
@@ -5655,6 +5666,18 @@ int md_run(struct mddev *mddev)
        sysfs_notify_dirent_safe(mddev->sysfs_action);
        sysfs_notify(&mddev->kobj, NULL, "degraded");
        return 0;
+
+abort:
+       if (mddev->bio_set) {
+               bioset_free(mddev->bio_set);
+               mddev->bio_set = NULL;
+       }
+       if (mddev->sync_set) {
+               bioset_free(mddev->sync_set);
+               mddev->sync_set = NULL;
+       }
+
+       return err;
 }
 EXPORT_SYMBOL_GPL(md_run);
 
@@ -8049,6 +8072,7 @@ EXPORT_SYMBOL(md_done_sync);
 bool md_write_start(struct mddev *mddev, struct bio *bi)
 {
        int did_change = 0;
+
        if (bio_data_dir(bi) != WRITE)
                return true;
 
@@ -8081,6 +8105,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
        rcu_read_unlock();
        if (did_change)
                sysfs_notify_dirent_safe(mddev->sysfs_state);
+       if (!mddev->has_superblocks)
+               return true;
        wait_event(mddev->sb_wait,
                   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
                   mddev->suspended);
@@ -8543,6 +8569,19 @@ void md_do_sync(struct md_thread *thread)
        set_mask_bits(&mddev->sb_flags, 0,
                      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
 
+       if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+                       !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+                       mddev->delta_disks > 0 &&
+                       mddev->pers->finish_reshape &&
+                       mddev->pers->size &&
+                       mddev->queue) {
+               mddev_lock_nointr(mddev);
+               md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
+               mddev_unlock(mddev);
+               set_capacity(mddev->gendisk, mddev->array_sectors);
+               revalidate_disk(mddev->gendisk);
+       }
+
        spin_lock(&mddev->lock);
        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
                /* We completed so min/max setting can be forgotten if used. */
@@ -8569,6 +8608,10 @@ static int remove_and_add_spares(struct mddev *mddev,
        int removed = 0;
        bool remove_some = false;
 
+       if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+               /* Mustn't remove devices when resync thread is running */
+               return 0;
+
        rdev_for_each(rdev, mddev) {
                if ((this == NULL || rdev == this) &&
                    rdev->raid_disk >= 0 &&
index 58cd20a5e85edb1db853661dcd749d99682a796c..fbc925cce8107019dcc4b2aa5310c7d100431a46 100644 (file)
@@ -468,6 +468,8 @@ struct mddev {
        void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
        struct md_cluster_info          *cluster_info;
        unsigned int                    good_device_nr; /* good device num within cluster raid */
+
+       bool    has_superblocks:1;
 };
 
 enum recovery_flags {
index f978eddc7a21c8bd442c9b4afafb275ce2c17a2c..fe872dc6712ed0c5c00caa60e5f152876f0b1025 100644 (file)
@@ -1809,6 +1809,17 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
                        struct md_rdev *repl =
                                conf->mirrors[conf->raid_disks + number].rdev;
                        freeze_array(conf, 0);
+                       if (atomic_read(&repl->nr_pending)) {
+                               /* It means that some queued IO of retry_list
+                                * hold repl. Thus, we cannot set replacement
+                                * as NULL, avoiding rdev NULL pointer
+                                * dereference in sync_request_write and
+                                * handle_write_finished.
+                                */
+                               err = -EBUSY;
+                               unfreeze_array(conf);
+                               goto abort;
+                       }
                        clear_bit(Replacement, &repl->flags);
                        p->rdev = repl;
                        conf->mirrors[conf->raid_disks + number].rdev = NULL;
index c7294e7557e038bf5248e8c8bf9ce84cb6afad58..eb84bc68e2fd4c31cc996bc4d399183cd852b8f2 100644 (file)
 #define BARRIER_BUCKETS_NR_BITS                (PAGE_SHIFT - ilog2(sizeof(atomic_t)))
 #define BARRIER_BUCKETS_NR             (1<<BARRIER_BUCKETS_NR_BITS)
 
+/* Note: raid1_info.rdev can be set to NULL asynchronously by raid1_remove_disk.
+ * There are three safe ways to access raid1_info.rdev.
+ * 1/ when holding mddev->reconfig_mutex
+ * 2/ when resync/recovery is known to be happening - i.e. in code that is
+ *    called as part of performing resync/recovery.
+ * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
+ *    and if it is non-NULL, increment rdev->nr_pending before dropping the
+ *    RCU lock.
+ * When .rdev is set to NULL, the nr_pending count checked again and if it has
+ * been incremented, the pointer is put back in .rdev.
+ */
+
 struct raid1_info {
        struct md_rdev  *rdev;
        sector_t        head_position;
index 99c9207899a777c615880f286a6f617ffc4a70bf..c5e6c60fc0d41b53a578874087ae87259e3ebcce 100644 (file)
@@ -141,7 +141,7 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 #define RESYNC_WINDOW (1024*1024)
 /* maximum number of concurrent requests, memory permitting */
 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
-#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
+#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
 #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
 
 /*
@@ -2655,7 +2655,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
                for (m = 0; m < conf->copies; m++) {
                        int dev = r10_bio->devs[m].devnum;
                        rdev = conf->mirrors[dev].rdev;
-                       if (r10_bio->devs[m].bio == NULL)
+                       if (r10_bio->devs[m].bio == NULL ||
+                               r10_bio->devs[m].bio->bi_end_io == NULL)
                                continue;
                        if (!r10_bio->devs[m].bio->bi_status) {
                                rdev_clear_badblocks(
@@ -2670,7 +2671,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
                                        md_error(conf->mddev, rdev);
                        }
                        rdev = conf->mirrors[dev].replacement;
-                       if (r10_bio->devs[m].repl_bio == NULL)
+                       if (r10_bio->devs[m].repl_bio == NULL ||
+                               r10_bio->devs[m].repl_bio->bi_end_io == NULL)
                                continue;
 
                        if (!r10_bio->devs[m].repl_bio->bi_status) {
@@ -3782,7 +3784,7 @@ static int raid10_run(struct mddev *mddev)
                if (fc > 1 || fo > 0) {
                        pr_err("only near layout is supported by clustered"
                                " raid10\n");
-                       goto out;
+                       goto out_free_conf;
                }
        }
 
@@ -4830,17 +4832,11 @@ static void raid10_finish_reshape(struct mddev *mddev)
                return;
 
        if (mddev->delta_disks > 0) {
-               sector_t size = raid10_size(mddev, 0, 0);
-               md_set_array_sectors(mddev, size);
                if (mddev->recovery_cp > mddev->resync_max_sectors) {
                        mddev->recovery_cp = mddev->resync_max_sectors;
                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                }
-               mddev->resync_max_sectors = size;
-               if (mddev->queue) {
-                       set_capacity(mddev->gendisk, mddev->array_sectors);
-                       revalidate_disk(mddev->gendisk);
-               }
+               mddev->resync_max_sectors = mddev->array_sectors;
        } else {
                int d;
                rcu_read_lock();
index db2ac22ac1b42801a14e5eab8641109a3aa61505..e2e8840de9bfab734e9238bc59ca8e9058c7de0b 100644 (file)
@@ -2,6 +2,19 @@
 #ifndef _RAID10_H
 #define _RAID10_H
 
+/* Note: raid10_info.rdev can be set to NULL asynchronously by
+ * raid10_remove_disk.
+ * There are three safe ways to access raid10_info.rdev.
+ * 1/ when holding mddev->reconfig_mutex
+ * 2/ when resync/recovery/reshape is known to be happening - i.e. in code
+ *    that is called as part of performing resync/recovery/reshape.
+ * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
+ *    and if it is non-NULL, increment rdev->nr_pending before dropping the
+ *    RCU lock.
+ * When .rdev is set to NULL, the nr_pending count checked again and if it has
+ * been incremented, the pointer is put back in .rdev.
+ */
+
 struct raid10_info {
        struct md_rdev  *rdev, *replacement;
        sector_t        head_position;
index 0c76bcedfc1cbd8b0af33008652095e3862f22c9..a001808a2b77da16bc2ae3c1d4aa32b5f944a939 100644 (file)
@@ -44,6 +44,7 @@ extern void ppl_write_stripe_run(struct r5conf *conf);
 extern void ppl_stripe_write_finished(struct stripe_head *sh);
 extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
 extern void ppl_quiesce(struct r5conf *conf, int quiesce);
+extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio);
 
 static inline bool raid5_has_ppl(struct r5conf *conf)
 {
@@ -104,7 +105,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio)
        if (conf->log)
                ret = r5l_handle_flush_request(conf->log, bio);
        else if (raid5_has_ppl(conf))
-               ret = 0;
+               ret = ppl_handle_flush_request(conf->log, bio);
 
        return ret;
 }
index 2764c2290062862a607dca9145ef8f0e24cb5d89..42890a08375bc73b6bcdba7961e9c88062dded5d 100644 (file)
@@ -693,6 +693,16 @@ void ppl_quiesce(struct r5conf *conf, int quiesce)
        }
 }
 
+int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio)
+{
+       if (bio->bi_iter.bi_size == 0) {
+               bio_endio(bio);
+               return 0;
+       }
+       bio->bi_opf &= ~REQ_PREFLUSH;
+       return -EAGAIN;
+}
+
 void ppl_stripe_write_finished(struct stripe_head *sh)
 {
        struct ppl_io_unit *io;
index 50d01144b80535e2e937c16021cdeeba632b201e..b5d2601483e34fa97fc8cfb7faeff6f014c0e035 100644 (file)
@@ -2196,15 +2196,16 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 static int grow_stripes(struct r5conf *conf, int num)
 {
        struct kmem_cache *sc;
+       size_t namelen = sizeof(conf->cache_name[0]);
        int devs = max(conf->raid_disks, conf->previous_raid_disks);
 
        if (conf->mddev->gendisk)
-               sprintf(conf->cache_name[0],
+               snprintf(conf->cache_name[0], namelen,
                        "raid%d-%s", conf->level, mdname(conf->mddev));
        else
-               sprintf(conf->cache_name[0],
+               snprintf(conf->cache_name[0], namelen,
                        "raid%d-%p", conf->level, conf->mddev);
-       sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
+       snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
 
        conf->active_name = 0;
        sc = kmem_cache_create(conf->cache_name[conf->active_name],
@@ -6764,9 +6765,7 @@ static void free_conf(struct r5conf *conf)
 
        log_exit(conf);
 
-       if (conf->shrinker.nr_deferred)
-               unregister_shrinker(&conf->shrinker);
-
+       unregister_shrinker(&conf->shrinker);
        free_thread_groups(conf);
        shrink_stripes(conf);
        raid5_free_percpu(conf);
@@ -8001,13 +8000,7 @@ static void raid5_finish_reshape(struct mddev *mddev)
 
        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 
-               if (mddev->delta_disks > 0) {
-                       md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
-                       if (mddev->queue) {
-                               set_capacity(mddev->gendisk, mddev->array_sectors);
-                               revalidate_disk(mddev->gendisk);
-                       }
-               } else {
+               if (mddev->delta_disks <= 0) {
                        int d;
                        spin_lock_irq(&conf->device_lock);
                        mddev->degraded = raid5_calc_degraded(conf);
index 2e6123825095296555e8a73a52d63872f2fa2bff..3f8da26032accce9fdf492b58fc120a94ac31582 100644 (file)
@@ -450,6 +450,18 @@ enum {
  * HANDLE gets cleared if stripe_handle leaves nothing locked.
  */
 
+/* Note: disk_info.rdev can be set to NULL asynchronously by raid5_remove_disk.
+ * There are three safe ways to access disk_info.rdev.
+ * 1/ when holding mddev->reconfig_mutex
+ * 2/ when resync/recovery/reshape is known to be happening - i.e. in code that
+ *    is called as part of performing resync/recovery/reshape.
+ * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
+ *    and if it is non-NULL, increment rdev->nr_pending before dropping the RCU
+ *    lock.
+ * When .rdev is set to NULL, the nr_pending count checked again and if
+ * it has been incremented, the pointer is put back in .rdev.
+ */
+
 struct disk_info {
        struct md_rdev  *rdev, *replacement;
        struct page     *extra_page; /* extra page to use in prexor */