dm log userspace: allow mark requests to piggyback on flush requests
authorDongmao Zhang <dmzhang@suse.com>
Wed, 15 Jan 2014 21:44:37 +0000 (15:44 -0600)
committerMike Snitzer <snitzer@redhat.com>
Wed, 22 Jan 2014 04:46:27 +0000 (23:46 -0500)
In the cluster evironment, cluster write has poor performance because
userspace_flush() has to contact a userspace program (cmirrord) for
clear/mark/flush requests.  But both mark and flush requests require
cmirrord to communicate the message to all the cluster nodes for each
flush call.  This behaviour is really slow.

To address this we now merge mark and flush requests together to reduce
the kernel-userspace-kernel time.  We allow a new directive,
"integrated_flush" that can be used to instruct the kernel log code to
combine flush and mark requests when directed by userspace.  If not
directed by userspace (due to an older version of the userspace code
perhaps), the kernel will function as it did previously - preserving
backwards compatibility.  Additionally, flush requests are performed
lazily when only clear requests exist.

Signed-off-by: Dongmao Zhang <dmzhang@suse.com>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
drivers/md/dm-log-userspace-base.c
include/uapi/linux/dm-log-userspace.h

index 9429159d9ee335c78c57f19e023b68d602683acc..b953db6cc229ae65f30520c23918c2cf0abf4fd5 100644 (file)
 #include <linux/device-mapper.h>
 #include <linux/dm-log-userspace.h>
 #include <linux/module.h>
+#include <linux/workqueue.h>
 
 #include "dm-log-userspace-transfer.h"
 
-#define DM_LOG_USERSPACE_VSN "1.1.0"
+#define DM_LOG_USERSPACE_VSN "1.3.0"
 
 struct flush_entry {
        int type;
@@ -58,6 +59,18 @@ struct log_c {
        spinlock_t flush_lock;
        struct list_head mark_list;
        struct list_head clear_list;
+
+       /*
+        * Workqueue for flush of clear region requests.
+        */
+       struct workqueue_struct *dmlog_wq;
+       struct delayed_work flush_log_work;
+       atomic_t sched_flush;
+
+       /*
+        * Combine userspace flush and mark requests for efficiency.
+        */
+       uint32_t integrated_flush;
 };
 
 static mempool_t *flush_entry_pool;
@@ -122,6 +135,9 @@ static int build_constructor_string(struct dm_target *ti,
 
        *ctr_str = NULL;
 
+       /*
+        * Determine overall size of the string.
+        */
        for (i = 0, str_size = 0; i < argc; i++)
                str_size += strlen(argv[i]) + 1; /* +1 for space between args */
 
@@ -141,18 +157,39 @@ static int build_constructor_string(struct dm_target *ti,
        return str_size;
 }
 
+static void do_flush(struct work_struct *work)
+{
+       int r;
+       struct log_c *lc = container_of(work, struct log_c, flush_log_work.work);
+
+       atomic_set(&lc->sched_flush, 0);
+
+       r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL);
+
+       if (r)
+               dm_table_event(lc->ti->table);
+}
+
 /*
  * userspace_ctr
  *
  * argv contains:
- *     <UUID> <other args>
- * Where 'other args' is the userspace implementation specific log
- * arguments.  An example might be:
- *     <UUID> clustered-disk <arg count> <log dev> <region_size> [[no]sync]
+ *     <UUID> [integrated_flush] <other args>
+ * Where 'other args' are the userspace implementation-specific log
+ * arguments.
+ *
+ * Example:
+ *     <UUID> [integrated_flush] clustered-disk <arg count> <log dev>
+ *     <region_size> [[no]sync]
+ *
+ * This module strips off the <UUID> and uses it for identification
+ * purposes when communicating with userspace about a log.
  *
- * So, this module will strip off the <UUID> for identification purposes
- * when communicating with userspace about a log; but will pass on everything
- * else.
+ * If integrated_flush is defined, the kernel combines flush
+ * and mark requests.
+ *
+ * The rest of the line, beginning with 'clustered-disk', is passed
+ * to the userspace ctr function.
  */
 static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
                         unsigned argc, char **argv)
@@ -188,12 +225,22 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
                return -EINVAL;
        }
 
+       lc->usr_argc = argc;
+
        strncpy(lc->uuid, argv[0], DM_UUID_LEN);
+       argc--;
+       argv++;
        spin_lock_init(&lc->flush_lock);
        INIT_LIST_HEAD(&lc->mark_list);
        INIT_LIST_HEAD(&lc->clear_list);
 
-       str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
+       if (!strcasecmp(argv[0], "integrated_flush")) {
+               lc->integrated_flush = 1;
+               argc--;
+               argv++;
+       }
+
+       str_size = build_constructor_string(ti, argc, argv, &ctr_str);
        if (str_size < 0) {
                kfree(lc);
                return str_size;
@@ -246,6 +293,19 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
                        DMERR("Failed to register %s with device-mapper",
                              devices_rdata);
        }
+
+       if (lc->integrated_flush) {
+               lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0);
+               if (!lc->dmlog_wq) {
+                       DMERR("couldn't start dmlogd");
+                       r = -ENOMEM;
+                       goto out;
+               }
+
+               INIT_DELAYED_WORK(&lc->flush_log_work, do_flush);
+               atomic_set(&lc->sched_flush, 0);
+       }
+
 out:
        kfree(devices_rdata);
        if (r) {
@@ -253,7 +313,6 @@ out:
                kfree(ctr_str);
        } else {
                lc->usr_argv_str = ctr_str;
-               lc->usr_argc = argc;
                log->context = lc;
        }
 
@@ -264,9 +323,16 @@ static void userspace_dtr(struct dm_dirty_log *log)
 {
        struct log_c *lc = log->context;
 
+       if (lc->integrated_flush) {
+               /* flush workqueue */
+               if (atomic_read(&lc->sched_flush))
+                       flush_delayed_work(&lc->flush_log_work);
+
+               destroy_workqueue(lc->dmlog_wq);
+       }
+
        (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
-                                NULL, 0,
-                                NULL, NULL);
+                                   NULL, 0, NULL, NULL);
 
        if (lc->log_dev)
                dm_put_device(lc->ti, lc->log_dev);
@@ -283,8 +349,7 @@ static int userspace_presuspend(struct dm_dirty_log *log)
        struct log_c *lc = log->context;
 
        r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
-                                NULL, 0,
-                                NULL, NULL);
+                                NULL, 0, NULL, NULL);
 
        return r;
 }
@@ -294,9 +359,14 @@ static int userspace_postsuspend(struct dm_dirty_log *log)
        int r;
        struct log_c *lc = log->context;
 
+       /*
+        * Run planned flush earlier.
+        */
+       if (lc->integrated_flush && atomic_read(&lc->sched_flush))
+               flush_delayed_work(&lc->flush_log_work);
+
        r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
-                                NULL, 0,
-                                NULL, NULL);
+                                NULL, 0, NULL, NULL);
 
        return r;
 }
@@ -308,8 +378,7 @@ static int userspace_resume(struct dm_dirty_log *log)
 
        lc->in_sync_hint = 0;
        r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
-                                NULL, 0,
-                                NULL, NULL);
+                                NULL, 0, NULL, NULL);
 
        return r;
 }
@@ -405,7 +474,8 @@ static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
        return r;
 }
 
-static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
+static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
+                         int flush_with_payload)
 {
        int r = 0;
        int count;
@@ -431,15 +501,29 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
                                break;
                }
 
-               r = userspace_do_request(lc, lc->uuid, type,
-                                        (char *)(group),
-                                        count * sizeof(uint64_t),
-                                        NULL, NULL);
-               if (r) {
-                       /* Group send failed.  Attempt one-by-one. */
-                       list_splice_init(&tmp_list, flush_list);
-                       r = flush_one_by_one(lc, flush_list);
-                       break;
+               if (flush_with_payload) {
+                       r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+                                                (char *)(group),
+                                                count * sizeof(uint64_t),
+                                                NULL, NULL);
+                       /*
+                        * Integrated flush failed.
+                        */
+                       if (r)
+                               break;
+               } else {
+                       r = userspace_do_request(lc, lc->uuid, type,
+                                                (char *)(group),
+                                                count * sizeof(uint64_t),
+                                                NULL, NULL);
+                       if (r) {
+                               /*
+                                * Group send failed.  Attempt one-by-one.
+                                */
+                               list_splice_init(&tmp_list, flush_list);
+                               r = flush_one_by_one(lc, flush_list);
+                               break;
+                       }
                }
        }
 
@@ -476,6 +560,8 @@ static int userspace_flush(struct dm_dirty_log *log)
        struct log_c *lc = log->context;
        LIST_HEAD(mark_list);
        LIST_HEAD(clear_list);
+       int mark_list_is_empty;
+       int clear_list_is_empty;
        struct flush_entry *fe, *tmp_fe;
 
        spin_lock_irqsave(&lc->flush_lock, flags);
@@ -483,23 +569,51 @@ static int userspace_flush(struct dm_dirty_log *log)
        list_splice_init(&lc->clear_list, &clear_list);
        spin_unlock_irqrestore(&lc->flush_lock, flags);
 
-       if (list_empty(&mark_list) && list_empty(&clear_list))
+       mark_list_is_empty = list_empty(&mark_list);
+       clear_list_is_empty = list_empty(&clear_list);
+
+       if (mark_list_is_empty && clear_list_is_empty)
                return 0;
 
-       r = flush_by_group(lc, &mark_list);
+       r = flush_by_group(lc, &clear_list, 0);
        if (r)
-               goto fail;
+               goto out;
 
-       r = flush_by_group(lc, &clear_list);
+       if (!lc->integrated_flush) {
+               r = flush_by_group(lc, &mark_list, 0);
+               if (r)
+                       goto out;
+               r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+                                        NULL, 0, NULL, NULL);
+               goto out;
+       }
+
+       /*
+        * Send integrated flush request with mark_list as payload.
+        */
+       r = flush_by_group(lc, &mark_list, 1);
        if (r)
-               goto fail;
+               goto out;
 
-       r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
-                                NULL, 0, NULL, NULL);
+       if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) {
+               /*
+                * When there are only clear region requests,
+                * we schedule a flush in the future.
+                */
+               queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ);
+               atomic_set(&lc->sched_flush, 1);
+       } else {
+               /*
+                * Cancel pending flush because we
+                * have already flushed in mark_region.
+                */
+               cancel_delayed_work(&lc->flush_log_work);
+               atomic_set(&lc->sched_flush, 0);
+       }
 
-fail:
+out:
        /*
-        * We can safely remove these entries, even if failure.
+        * We can safely remove these entries, even after failure.
         * Calling code will receive an error and will know that
         * the log facility has failed.
         */
@@ -603,8 +717,7 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
 
        rdata_size = sizeof(pkg);
        r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
-                                NULL, 0,
-                                (char *)&pkg, &rdata_size);
+                                NULL, 0, (char *)&pkg, &rdata_size);
 
        *region = pkg.r;
        return (r) ? r : (int)pkg.i;
@@ -630,8 +743,7 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
        pkg.i = (int64_t)in_sync;
 
        r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
-                                (char *)&pkg, sizeof(pkg),
-                                NULL, NULL);
+                                (char *)&pkg, sizeof(pkg), NULL, NULL);
 
        /*
         * It would be nice to be able to report failures.
@@ -657,8 +769,7 @@ static region_t userspace_get_sync_count(struct dm_dirty_log *log)
 
        rdata_size = sizeof(sync_count);
        r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
-                                NULL, 0,
-                                (char *)&sync_count, &rdata_size);
+                                NULL, 0, (char *)&sync_count, &rdata_size);
 
        if (r)
                return 0;
@@ -685,8 +796,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
        switch (status_type) {
        case STATUSTYPE_INFO:
                r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
-                                        NULL, 0,
-                                        result, &sz);
+                                        NULL, 0, result, &sz);
 
                if (r) {
                        sz = 0;
@@ -699,8 +809,10 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
                BUG_ON(!table_args); /* There will always be a ' ' */
                table_args++;
 
-               DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc,
-                      lc->uuid, table_args);
+               DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid);
+               if (lc->integrated_flush)
+                       DMEMIT("integrated_flush ");
+               DMEMIT("%s ", table_args);
                break;
        }
        return (r) ? 0 : (int)sz;
index 0678c2adc42109dd6a88b823f4bc7dfa77c8fcdf..0fa0d9ef06a59b4e69cd63574519a58630c31774 100644 (file)
  * int (*flush)(struct dm_dirty_log *log);
  *
  * Payload-to-userspace:
- *     None.
+ *     If the 'integrated_flush' directive is present in the constructor
+ *     table, the payload is as same as DM_ULOG_MARK_REGION:
+ *             uint64_t [] - region(s) to mark
+ *     else
+ *             None
  * Payload-to-kernel:
  *     None.
  *
- * No incoming or outgoing payload.  Simply flush log state to disk.
+ * If the 'integrated_flush' option was used during the creation of the
+ * log, mark region requests are carried as payload in the flush request.
+ * Piggybacking the mark requests in this way allows for fewer communications
+ * between kernel and userspace.
  *
  * When the request has been processed, user-space must return the
  * dm_ulog_request to the kernel - setting the 'error' field and clearing
  *     version 2:  DM_ULOG_CTR allowed to return a string containing a
  *                 device name that is to be registered with DM via
  *                 'dm_get_device'.
+ *     version 3:  DM_ULOG_FLUSH is capable of carrying payload for marking
+ *                 regions.  This "integrated flush" reduces the number of
+ *                 requests between the kernel and userspace by effectively
+ *                 merging 'mark' and 'flush' requests.  A constructor table
+ *                 argument ('integrated_flush') is required to turn this
+ *                 feature on, so it is backwards compatible with older
+ *                 userspace versions.
  */
-#define DM_ULOG_REQUEST_VERSION 2
+#define DM_ULOG_REQUEST_VERSION 3
 
 struct dm_ulog_request {
        /*