nvme-multipath: round-robin I/O policy
author Hannes Reinecke <hare@suse.de>
Mon, 18 Feb 2019 10:43:26 +0000 (11:43 +0100)
committer Christoph Hellwig <hch@lst.de>
Wed, 20 Feb 2019 14:17:49 +0000 (07:17 -0700)
Implement a simple round-robin I/O policy for multipathing.  Path
selection is done in two rounds, first iterating across all optimized
paths, and if that doesn't return any valid paths, iterate over all
optimized and non-optimized paths.  If no paths are found, use the
existing algorithm.  Also add a sysfs attribute 'iopolicy' to switch
between the current NUMA-aware I/O policy and the 'round-robin' I/O
policy.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
drivers/nvme/host/core.c
drivers/nvme/host/multipath.c
drivers/nvme/host/nvme.h

index 34758cca7836307a06424b321bb9bfb0e9d84ee0..cba58d995b3021e59eb94cb3fd9c09e393c59009 100644 (file)
@@ -2328,6 +2328,9 @@ static struct attribute *nvme_subsys_attrs[] = {
        &subsys_attr_serial.attr,
        &subsys_attr_firmware_rev.attr,
        &subsys_attr_subsysnqn.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+       &subsys_attr_iopolicy.attr,
+#endif
        NULL,
 };
 
@@ -2380,6 +2383,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
        memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
        subsys->vendor_id = le16_to_cpu(id->vid);
        subsys->cmic = id->cmic;
+#ifdef CONFIG_NVME_MULTIPATH
+       subsys->iopolicy = NVME_IOPOLICY_NUMA;
+#endif
 
        subsys->dev.class = nvme_subsys_class;
        subsys->dev.release = nvme_release_subsystem;
index b9fff3b8ed1b1dd180b50de141bbc3d2af73a485..1f7fe1bd2936fb4a206e23f032f2d7e3698511b1 100644 (file)
@@ -141,7 +141,10 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
                    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
                        continue;
 
-               distance = node_distance(node, ns->ctrl->numa_node);
+               if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+                       distance = node_distance(node, ns->ctrl->numa_node);
+               else
+                       distance = LOCAL_DISTANCE;
 
                switch (ns->ana_state) {
                case NVME_ANA_OPTIMIZED:
@@ -168,6 +171,47 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
        return found;
 }
 
+static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
+               struct nvme_ns *ns)
+{      /* Return the sibling that follows @ns on head->list, wrapping to the list's first entry. */
+       ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
+                       siblings);
+       if (ns)
+               return ns;
+       return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);   /* no next entry: restart from the front (presumably NULL if the list is empty - confirm) */
+}
+
+static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
+               int node, struct nvme_ns *old)
+{      /* Choose the path to use after @old for @node; returns NULL when no other sibling is usable. */
+       struct nvme_ns *ns, *found, *fallback = NULL;
+
+       if (list_is_singular(&head->list))      /* only one path: nothing to rotate to, current_path is not touched */
+               return old;
+
+       for (ns = nvme_next_ns(head, old);      /* NOTE(review): ns is dereferenced below without a NULL check - confirm nvme_next_ns() cannot return NULL here */
+            ns != old;                 /* one full lap over the siblings, @old itself excluded */
+            ns = nvme_next_ns(head, ns)) {
+               if (ns->ctrl->state != NVME_CTRL_LIVE ||
+                   test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+                       continue;       /* skip paths whose controller is not live or that have ANA pending */
+
+               if (ns->ana_state == NVME_ANA_OPTIMIZED) {
+                       found = ns;     /* first optimized sibling after @old wins immediately */
+                       goto out;
+               }
+               if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
+                       fallback = ns;  /* overwritten on every match, so it ends up as the last non-optimized sibling visited */
+       }
+
+       if (!fallback)
+               return NULL;    /* nothing usable besides @old; current_path[node] is left unchanged */
+       found = fallback;
+out:
+       rcu_assign_pointer(head->current_path[node], found);    /* record the choice as the current path for @node */
+       return found;
+}
+
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 {
        return ns->ctrl->state == NVME_CTRL_LIVE &&
@@ -180,6 +224,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
        struct nvme_ns *ns;
 
        ns = srcu_dereference(head->current_path[node], &head->srcu);
+       if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
+               ns = nvme_round_robin_path(head, node, ns);
        if (unlikely(!ns || !nvme_path_is_optimized(ns)))
                ns = __nvme_find_path(head, node);
        return ns;
@@ -471,6 +517,44 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl)
        cancel_work_sync(&ctrl->ana_work);
 }
 
+#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
+       struct device_attribute subsys_attr_##_name =   \
+               __ATTR(_name, _mode, _show, _store)     /* defines a non-static subsys_attr_<_name> with the given mode and show/store handlers */
+
+static const char *nvme_iopolicy_names[] = {   /* sysfs strings for the 'iopolicy' attribute, indexed by enum nvme_iopolicy */
+       [NVME_IOPOLICY_NUMA]    = "numa",
+       [NVME_IOPOLICY_RR]      = "round-robin",
+};
+
+static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{      /* show handler for 'iopolicy': writes the current policy name and a newline into @buf, returns sprintf()'s result */
+       struct nvme_subsystem *subsys =
+               container_of(dev, struct nvme_subsystem, dev);  /* @dev is the device embedded in struct nvme_subsystem */
+
+       return sprintf(buf, "%s\n",
+                       nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);      /* policy value is used directly as the table index */
+}
+
+static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t count)
+{      /* store handler for 'iopolicy': selects the policy whose name matches @buf; returns @count, or -EINVAL if no name matches */
+       struct nvme_subsystem *subsys =
+               container_of(dev, struct nvme_subsystem, dev);
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
+               if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
+                       WRITE_ONCE(subsys->iopolicy, i);        /* table index doubles as the enum nvme_iopolicy value */
+                       return count;
+               }
+       }
+
+       return -EINVAL; /* unknown policy name: subsys->iopolicy is left unchanged */
+}
+SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
+                     nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);   /* defines subsys_attr_iopolicy, which this patch adds to nvme_subsys_attrs[] */
+
 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
                char *buf)
 {
index 23db2d99b53a6270753f6da9fc0133163c3670ab..8c646ab26677858523247374efc5a09e539956c5 100644 (file)
@@ -252,6 +252,11 @@ struct nvme_ctrl {
        unsigned long discard_page_busy;
 };
 
+enum nvme_iopolicy {   /* multipath path-selection policy stored in nvme_subsystem.iopolicy */
+       NVME_IOPOLICY_NUMA,     /* "numa": set by nvme_init_subsystem(); __nvme_find_path() weighs paths by node_distance() */
+       NVME_IOPOLICY_RR,       /* "round-robin": nvme_find_path() rotates paths via nvme_round_robin_path() */
+};
+
 struct nvme_subsystem {
        int                     instance;
        struct device           dev;
@@ -271,6 +276,9 @@ struct nvme_subsystem {
        u8                      cmic;
        u16                     vendor_id;
        struct ida              ns_ida;
+#ifdef CONFIG_NVME_MULTIPATH
+       enum nvme_iopolicy      iopolicy;
+#endif
 };
 
 /*
@@ -491,6 +499,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 
 extern struct device_attribute dev_attr_ana_grpid;
 extern struct device_attribute dev_attr_ana_state;
+extern struct device_attribute subsys_attr_iopolicy;
 
 #else
 static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)