Merge branch 'for-4.6/core' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 18 Mar 2016 23:43:11 +0000 (16:43 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 18 Mar 2016 23:43:11 +0000 (16:43 -0700)
Pull core block updates from Jens Axboe:
 "Here are the core block changes for this merge window.  Not a lot of
  exciting stuff going on in this round, most of the changes have been
  on the driver side of things.  That pull request is coming next.  This
  pull request contains:

   - A set of fixes for chained bio handling from Christoph.

   - A tag bounds check for blk-mq from Hannes, ensuring that we don't
     do something stupid if a device reports an invalid tag value.

   - A set of fixes/updates for the CFQ IO scheduler from Jan Kara.

   - A set of blk-mq fixes from Keith, adding support for dynamic
     hardware queues, and fixing init of max_dev_sectors for stacking
     devices.

   - A fix for the dynamic hw context from Ming.

   - Enabling of cgroup writeback support on a block device, from
     Shaohua"

* 'for-4.6/core' of git://git.kernel.dk/linux-block:
  blk-mq: add bounds check on tag-to-rq conversion
  block: bio_remaining_done() isn't unlikely
  block: cleanup bio_endio
  block: factor out chained bio completion
  block: don't unecessarily clobber bi_error for chained bios
  block-dev: enable writeback cgroup support
  blk-mq: Fix NULL pointer updating nr_requests
  blk-mq: mark request queue as mq asap
  block: Initialize max_dev_sectors to 0
  blk-mq: dynamic h/w context count
  cfq-iosched: Allow parent cgroup to preempt its child
  cfq-iosched: Allow sync noidle workloads to preempt each other
  cfq-iosched: Reorder checks in cfq_should_preempt()
  cfq-iosched: Don't group_idle if cfqq has big thinktime

block/bio.c
block/blk-mq-sysfs.c
block/blk-mq.c
block/blk-mq.h
block/cfq-iosched.c
fs/block_dev.c
include/linux/blk-mq.h

index cf7591551b1716b74fa3765cb1c271a554e8ef56..f124a0a624fcbeea867865e7a53f634275dfb4b8 100644 (file)
@@ -296,13 +296,19 @@ void bio_reset(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_reset);
 
-static void bio_chain_endio(struct bio *bio)
+static struct bio *__bio_chain_endio(struct bio *bio)
 {
        struct bio *parent = bio->bi_private;
 
-       parent->bi_error = bio->bi_error;
-       bio_endio(parent);
+       if (!parent->bi_error)
+               parent->bi_error = bio->bi_error;
        bio_put(bio);
+       return parent;
+}
+
+static void bio_chain_endio(struct bio *bio)
+{
+       bio_endio(__bio_chain_endio(bio));
 }
 
 /*
@@ -1742,29 +1748,25 @@ static inline bool bio_remaining_done(struct bio *bio)
  **/
 void bio_endio(struct bio *bio)
 {
-       while (bio) {
-               if (unlikely(!bio_remaining_done(bio)))
-                       break;
+again:
+       if (!bio_remaining_done(bio))
+               return;
 
-               /*
-                * Need to have a real endio function for chained bios,
-                * otherwise various corner cases will break (like stacking
-                * block devices that save/restore bi_end_io) - however, we want
-                * to avoid unbounded recursion and blowing the stack. Tail call
-                * optimization would handle this, but compiling with frame
-                * pointers also disables gcc's sibling call optimization.
-                */
-               if (bio->bi_end_io == bio_chain_endio) {
-                       struct bio *parent = bio->bi_private;
-                       parent->bi_error = bio->bi_error;
-                       bio_put(bio);
-                       bio = parent;
-               } else {
-                       if (bio->bi_end_io)
-                               bio->bi_end_io(bio);
-                       bio = NULL;
-               }
+       /*
+        * Need to have a real endio function for chained bios, otherwise
+        * various corner cases will break (like stacking block devices that
+        * save/restore bi_end_io) - however, we want to avoid unbounded
+        * recursion and blowing the stack. Tail call optimization would
+        * handle this, but compiling with frame pointers also disables
+        * gcc's sibling call optimization.
+        */
+       if (bio->bi_end_io == bio_chain_endio) {
+               bio = __bio_chain_endio(bio);
+               goto again;
        }
+
+       if (bio->bi_end_io)
+               bio->bi_end_io(bio);
 }
 EXPORT_SYMBOL(bio_endio);
 
index 1cf18784c5cf3c44be94dbd003ca9d7088f883e0..431fdda21737cb91b9a5a0f49fd1c2ccd2cfae7c 100644 (file)
@@ -408,17 +408,18 @@ void blk_mq_unregister_disk(struct gendisk *disk)
        blk_mq_enable_hotplug();
 }
 
+void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
+{
+       kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
+}
+
 static void blk_mq_sysfs_init(struct request_queue *q)
 {
-       struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        int i;
 
        kobject_init(&q->mq_kobj, &blk_mq_ktype);
 
-       queue_for_each_hw_ctx(q, hctx, i)
-               kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
-
        queue_for_each_ctx(q, ctx, i)
                kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
 }
index 56c0a726b619374ec9f8cf4b60649a14919f0625..050f7a13021baca7347df79d8d26dad2b89e327e 100644 (file)
@@ -544,7 +544,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list);
 
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
-       return tags->rqs[tag];
+       if (tag < tags->nr_tags)
+               return tags->rqs[tag];
+
+       return NULL;
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
@@ -1744,31 +1747,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
        return -1;
 }
 
-static int blk_mq_init_hw_queues(struct request_queue *q,
-               struct blk_mq_tag_set *set)
-{
-       struct blk_mq_hw_ctx *hctx;
-       unsigned int i;
-
-       /*
-        * Initialize hardware queues
-        */
-       queue_for_each_hw_ctx(q, hctx, i) {
-               if (blk_mq_init_hctx(q, set, hctx, i))
-                       break;
-       }
-
-       if (i == q->nr_hw_queues)
-               return 0;
-
-       /*
-        * Init failed
-        */
-       blk_mq_exit_hw_queues(q, set, i);
-
-       return 1;
-}
-
 static void blk_mq_init_cpu_queues(struct request_queue *q,
                                   unsigned int nr_hw_queues)
 {
@@ -1826,6 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                        continue;
 
                hctx = q->mq_ops->map_queue(q, i);
+
                cpumask_set_cpu(i, hctx->cpumask);
                ctx->index_hw = hctx->nr_ctx;
                hctx->ctxs[hctx->nr_ctx++] = ctx;
@@ -1974,56 +1953,93 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 }
 EXPORT_SYMBOL(blk_mq_init_queue);
 
-struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
-                                                 struct request_queue *q)
+static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+                                               struct request_queue *q)
 {
-       struct blk_mq_hw_ctx **hctxs;
-       struct blk_mq_ctx __percpu *ctx;
-       unsigned int *map;
-       int i;
-
-       ctx = alloc_percpu(struct blk_mq_ctx);
-       if (!ctx)
-               return ERR_PTR(-ENOMEM);
-
-       hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
-                       set->numa_node);
-
-       if (!hctxs)
-               goto err_percpu;
-
-       map = blk_mq_make_queue_map(set);
-       if (!map)
-               goto err_map;
+       int i, j;
+       struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
 
+       blk_mq_sysfs_unregister(q);
        for (i = 0; i < set->nr_hw_queues; i++) {
-               int node = blk_mq_hw_queue_to_node(map, i);
+               int node;
 
+               if (hctxs[i])
+                       continue;
+
+               node = blk_mq_hw_queue_to_node(q->mq_map, i);
                hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
                                        GFP_KERNEL, node);
                if (!hctxs[i])
-                       goto err_hctxs;
+                       break;
 
                if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
-                                               node))
-                       goto err_hctxs;
+                                               node)) {
+                       kfree(hctxs[i]);
+                       hctxs[i] = NULL;
+                       break;
+               }
 
                atomic_set(&hctxs[i]->nr_active, 0);
                hctxs[i]->numa_node = node;
                hctxs[i]->queue_num = i;
+
+               if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
+                       free_cpumask_var(hctxs[i]->cpumask);
+                       kfree(hctxs[i]);
+                       hctxs[i] = NULL;
+                       break;
+               }
+               blk_mq_hctx_kobj_init(hctxs[i]);
        }
+       for (j = i; j < q->nr_hw_queues; j++) {
+               struct blk_mq_hw_ctx *hctx = hctxs[j];
+
+               if (hctx) {
+                       if (hctx->tags) {
+                               blk_mq_free_rq_map(set, hctx->tags, j);
+                               set->tags[j] = NULL;
+                       }
+                       blk_mq_exit_hctx(q, set, hctx, j);
+                       free_cpumask_var(hctx->cpumask);
+                       kobject_put(&hctx->kobj);
+                       kfree(hctx->ctxs);
+                       kfree(hctx);
+                       hctxs[j] = NULL;
+
+               }
+       }
+       q->nr_hw_queues = i;
+       blk_mq_sysfs_register(q);
+}
+
+struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+                                                 struct request_queue *q)
+{
+       /* mark the queue as mq asap */
+       q->mq_ops = set->ops;
+
+       q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
+       if (!q->queue_ctx)
+               return ERR_PTR(-ENOMEM);
+
+       q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
+                                               GFP_KERNEL, set->numa_node);
+       if (!q->queue_hw_ctx)
+               goto err_percpu;
+
+       q->mq_map = blk_mq_make_queue_map(set);
+       if (!q->mq_map)
+               goto err_map;
+
+       blk_mq_realloc_hw_ctxs(set, q);
+       if (!q->nr_hw_queues)
+               goto err_hctxs;
 
        INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
        blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
 
        q->nr_queues = nr_cpu_ids;
-       q->nr_hw_queues = set->nr_hw_queues;
-       q->mq_map = map;
-
-       q->queue_ctx = ctx;
-       q->queue_hw_ctx = hctxs;
 
-       q->mq_ops = set->ops;
        q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
        if (!(set->flags & BLK_MQ_F_SG_MERGE))
@@ -2050,9 +2066,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
        blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 
-       if (blk_mq_init_hw_queues(q, set))
-               goto err_hctxs;
-
        get_online_cpus();
        mutex_lock(&all_q_mutex);
 
@@ -2066,17 +2079,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        return q;
 
 err_hctxs:
-       kfree(map);
-       for (i = 0; i < set->nr_hw_queues; i++) {
-               if (!hctxs[i])
-                       break;
-               free_cpumask_var(hctxs[i]->cpumask);
-               kfree(hctxs[i]);
-       }
+       kfree(q->mq_map);
 err_map:
-       kfree(hctxs);
+       kfree(q->queue_hw_ctx);
 err_percpu:
-       free_percpu(ctx);
+       free_percpu(q->queue_ctx);
        return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
@@ -2284,9 +2291,13 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
                set->nr_hw_queues = 1;
                set->queue_depth = min(64U, set->queue_depth);
        }
+       /*
+        * There is no use for more h/w queues than cpus.
+        */
+       if (set->nr_hw_queues > nr_cpu_ids)
+               set->nr_hw_queues = nr_cpu_ids;
 
-       set->tags = kmalloc_node(set->nr_hw_queues *
-                                sizeof(struct blk_mq_tags *),
+       set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
                                 GFP_KERNEL, set->numa_node);
        if (!set->tags)
                return -ENOMEM;
@@ -2309,7 +2320,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 {
        int i;
 
-       for (i = 0; i < set->nr_hw_queues; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                if (set->tags[i])
                        blk_mq_free_rq_map(set, set->tags[i], i);
        }
@@ -2330,6 +2341,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 
        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
+               if (!hctx->tags)
+                       continue;
                ret = blk_mq_tag_update_depth(hctx->tags, nr);
                if (ret)
                        break;
@@ -2341,6 +2354,35 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        return ret;
 }
 
+void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
+{
+       struct request_queue *q;
+
+       if (nr_hw_queues > nr_cpu_ids)
+               nr_hw_queues = nr_cpu_ids;
+       if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
+               return;
+
+       list_for_each_entry(q, &set->tag_list, tag_set_list)
+               blk_mq_freeze_queue(q);
+
+       set->nr_hw_queues = nr_hw_queues;
+       list_for_each_entry(q, &set->tag_list, tag_set_list) {
+               blk_mq_realloc_hw_ctxs(set, q);
+
+               if (q->nr_hw_queues > 1)
+                       blk_queue_make_request(q, blk_mq_make_request);
+               else
+                       blk_queue_make_request(q, blk_sq_make_request);
+
+               blk_mq_queue_reinit(q, cpu_online_mask);
+       }
+
+       list_for_each_entry(q, &set->tag_list, tag_set_list)
+               blk_mq_unfreeze_queue(q);
+}
+EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
+
 void blk_mq_disable_hotplug(void)
 {
        mutex_lock(&all_q_mutex);
index eaede8e45c9c3e5f2555ea86ec39492e1c042e59..9087b11037b70ae514fd46ea10f659152f291aa2 100644 (file)
@@ -57,6 +57,7 @@ extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
  */
 extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
+extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
 
 extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
 
index 1f9093e901daed7849a54633be5d80ce96b010f4..e3c591dd8f19d0b46fe42dc842510f9d220af143 100644 (file)
@@ -632,6 +632,13 @@ static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
        return pblkg ? blkg_to_cfqg(pblkg) : NULL;
 }
 
+static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
+                                     struct cfq_group *ancestor)
+{
+       return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
+                                   cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
+}
+
 static inline void cfqg_get(struct cfq_group *cfqg)
 {
        return blkg_get(cfqg_to_blkg(cfqg));
@@ -758,6 +765,11 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
 #else  /* CONFIG_CFQ_GROUP_IOSCHED */
 
 static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
+static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
+                                     struct cfq_group *ancestor)
+{
+       return true;
+}
 static inline void cfqg_get(struct cfq_group *cfqg) { }
 static inline void cfqg_put(struct cfq_group *cfqg) { }
 
@@ -2897,6 +2909,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 {
        struct cfq_queue *cfqq = cfqd->active_queue;
+       struct cfq_rb_root *st = cfqq->service_tree;
        struct cfq_io_cq *cic;
        unsigned long sl, group_idle = 0;
 
@@ -2947,8 +2960,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
                return;
        }
 
-       /* There are other queues in the group, don't do group idle */
-       if (group_idle && cfqq->cfqg->nr_cfqq > 1)
+       /*
+        * There are other queues in the group or this is the only group and
+        * it has too big thinktime, don't do group idle.
+        */
+       if (group_idle &&
+           (cfqq->cfqg->nr_cfqq > 1 ||
+            cfq_io_thinktime_big(cfqd, &st->ttime, true)))
                return;
 
        cfq_mark_cfqq_wait_request(cfqq);
@@ -3947,16 +3965,27 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
        if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
                return true;
 
-       if (new_cfqq->cfqg != cfqq->cfqg)
+       /*
+        * Treat ancestors of current cgroup the same way as current cgroup.
+        * For anybody else we disallow preemption to guarantee service
+        * fairness among cgroups.
+        */
+       if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
                return false;
 
        if (cfq_slice_used(cfqq))
                return true;
 
+       /*
+        * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
+        */
+       if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
+               return true;
+
+       WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
        /* Allow preemption only if we are idling on sync-noidle tree */
        if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
            cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
-           new_cfqq->service_tree->count == 2 &&
            RB_EMPTY_ROOT(&cfqq->sort_list))
                return true;
 
@@ -3967,12 +3996,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
        if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
                return true;
 
-       /*
-        * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
-        */
-       if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
-               return true;
-
        /* An idle queue should not be idle now for some reason */
        if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
                return true;
index 826b164a4b5b1faa9719aefc9cceccec20eda479..3172c4e2f50255e1cb1fb426c1723a6b7b493bb2 100644 (file)
@@ -575,7 +575,11 @@ static const struct super_operations bdev_sops = {
 static struct dentry *bd_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
 {
-       return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
+       struct dentry *dent;
+       dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
+       if (dent)
+               dent->d_sb->s_iflags |= SB_I_CGROUPWB;
+       return dent;
 }
 
 static struct file_system_type bd_type = {
index 7fc9296b574290e768dbb38585e725525802940b..15a73d49fd1d54a4b401bf3a042b466de6f311c8 100644 (file)
@@ -244,6 +244,8 @@ void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);
 
+void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
+
 /*
  * Driver command data is immediately after the request. So subtract request
  * size to get back to the original request, add request size to get the PDU.