md: Make flush bios explicitly sync
[sfrench/cifs-2.6.git] / drivers / md / raid5-cache.c
index 26ba09282e7c9691bdc352c5fb4b75b9f6e29821..0a7af8b0a80a031a99a7af1742e2d64e6df0d106 100644 (file)
@@ -24,6 +24,7 @@
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
+#include "raid5-log.h"
 
 /*
  * metadata/data stored in disk with 4k size unit (a block) regardless
@@ -622,20 +623,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
        __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
        spin_unlock_irqrestore(&log->io_list_lock, flags);
 
+       /*
+        * In case of journal device failure, submit_bio will get an error
+        * and call endio, then active stripes will continue the write
+        * process. Therefore, it is not necessary to check the Faulty bit
+        * of the journal device here.
+        *
+        * We can't check split_bio after current_bio is submitted. If
+        * io->split_bio is null, after current_bio is submitted, current_bio
+        * might already be completed and the io_unit is freed. We submit
+        * split_bio first to avoid the issue.
+        */
+       if (io->split_bio) {
+               if (io->has_flush)
+                       io->split_bio->bi_opf |= REQ_PREFLUSH;
+               if (io->has_fua)
+                       io->split_bio->bi_opf |= REQ_FUA;
+               submit_bio(io->split_bio);
+       }
+
        if (io->has_flush)
                io->current_bio->bi_opf |= REQ_PREFLUSH;
        if (io->has_fua)
                io->current_bio->bi_opf |= REQ_FUA;
        submit_bio(io->current_bio);
-
-       if (!io->split_bio)
-               return;
-
-       if (io->has_flush)
-               io->split_bio->bi_opf |= REQ_PREFLUSH;
-       if (io->has_fua)
-               io->split_bio->bi_opf |= REQ_FUA;
-       submit_bio(io->split_bio);
 }
 
 /* deferred io_unit will be dispatched here */
@@ -670,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work)
                return;
        pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
                mdname(mddev));
+
+       /* wait for superblock change to complete before suspending */
+       wait_event(mddev->sb_wait,
+                  !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
        mddev_suspend(mddev);
        log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
        mddev_resume(mddev);
@@ -1766,7 +1782,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
        mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
                                             mb, PAGE_SIZE));
        if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
-                         REQ_FUA, false)) {
+                         REQ_SYNC | REQ_FUA, false)) {
                __free_page(page);
                return -EIO;
        }
@@ -2372,7 +2388,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
                mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
                                                     mb, PAGE_SIZE));
                sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
-                            REQ_OP_WRITE, REQ_FUA, false);
+                            REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false);
                sh->log_start = ctx->pos;
                list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
                atomic_inc(&log->stripe_in_journal_count);
@@ -2621,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf,
         * When run in degraded mode, array is set to write-through mode.
         * This check helps drain pending write safely in the transition to
         * write-through mode.
+        *
+        * When a stripe is syncing, the write is also handled in write
+        * through mode.
         */
-       if (s->failed) {
+       if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
                r5c_make_stripe_write_out(sh);
                return -EAGAIN;
        }
@@ -2825,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
        }
 
        r5l_append_flush_payload(log, sh->sector);
+       /* stripe is flushed to raid disks, we can do resync now */
+       if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+               set_bit(STRIPE_HANDLE, &sh->state);
 }
 
 int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
@@ -2973,7 +2995,7 @@ ioerr:
        return ret;
 }
 
-void r5c_update_on_rdev_error(struct mddev *mddev)
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
 {
        struct r5conf *conf = mddev->private;
        struct r5l_log *log = conf->log;
@@ -2981,7 +3003,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev)
        if (!log)
                return;
 
-       if (raid5_calc_degraded(conf) > 0 &&
+       if ((raid5_calc_degraded(conf) > 0 ||
+            test_bit(Journal, &rdev->flags)) &&
            conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
                schedule_work(&log->disable_writeback_work);
 }