blk-mq: init hctx sched after updating ctx and hctx mapping
author     Jianchao Wang <jianchao.w.wang@oracle.com>
           Tue, 21 Aug 2018 07:15:03 +0000 (15:15 +0800)
committer  Jens Axboe <axboe@kernel.dk>
           Tue, 21 Aug 2018 15:02:55 +0000 (09:02 -0600)
Currently, when updating nr_hw_queues, the IO scheduler's init_hctx is
invoked before the mapping between ctx and hctx has been adjusted by
blk_mq_map_swqueue. An IO scheduler's init_hctx (kyber, for example)
may depend on this mapping, compute a wrong result, and finally panic.
A simple way to fix this is to switch the IO scheduler to 'none' before
updating nr_hw_queues, and then switch it back afterwards.
blk_mq_sched_init_hctx() and blk_mq_sched_exit_hctx() are removed since
nobody uses them any more.

Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/blk-mq-sched.c
block/blk-mq-sched.h
block/blk-mq.c
block/blk.h
block/elevator.c

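Before reading the hunks, here is a condensed sketch of the flow that
__blk_mq_update_nr_hw_queues() follows once this patch is applied. It is
only an illustration assembled from the blk-mq.c hunks below; error
handling and the existing sanity checks are elided:

	LIST_HEAD(head);	/* caches a struct blk_mq_qe_pair per queue */

	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_freeze_queue(q);

	/* detach each queue's IO scheduler, remembering its elevator_type */
	list_for_each_entry(q, &set->tag_list, tag_set_list)
		if (!blk_mq_elv_switch_none(&head, q))
			goto switch_back;

	/* now it is safe to rebuild the ctx <-> hctx mapping */
	set->nr_hw_queues = nr_hw_queues;
	blk_mq_update_queue_map(set);
	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_queue_reinit(q);

switch_back:
	/* re-attach the cached schedulers against the new mapping */
	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_elv_switch_back(&head, q);

	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_unfreeze_queue(q);
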
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index cf9c66c6d35a866c63a8a1785eeab769ec3298d4..29bfe8017a2d8e6cbadeab6b9d1d63d293f656a5 100644
@@ -462,50 +462,6 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
                blk_mq_sched_free_tags(set, hctx, i);
 }
 
-int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
-                          unsigned int hctx_idx)
-{
-       struct elevator_queue *e = q->elevator;
-       int ret;
-
-       if (!e)
-               return 0;
-
-       ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
-       if (ret)
-               return ret;
-
-       if (e->type->ops.mq.init_hctx) {
-               ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
-               if (ret) {
-                       blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
-                       return ret;
-               }
-       }
-
-       blk_mq_debugfs_register_sched_hctx(q, hctx);
-
-       return 0;
-}
-
-void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
-                           unsigned int hctx_idx)
-{
-       struct elevator_queue *e = q->elevator;
-
-       if (!e)
-               return;
-
-       blk_mq_debugfs_unregister_sched_hctx(hctx);
-
-       if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
-               e->type->ops.mq.exit_hctx(hctx, hctx_idx);
-               hctx->sched_data = NULL;
-       }
-
-       blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
-}
-
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 {
        struct blk_mq_hw_ctx *hctx;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 0cb8f938dff9d6a9b2fd3f6b519790c6f9f7b805..4e028ee4243014ff8eed44627f8e0cc5068217ca 100644
@@ -28,11 +28,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
 
-int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
-                          unsigned int hctx_idx);
-void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
-                           unsigned int hctx_idx);
-
 static inline bool
 blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5efd789910e25c1c187c8b36c8686bfb5a47a555..9c8c8c71a13f1683524e1b74433fd897cbeb9fcb 100644
@@ -2147,8 +2147,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
        if (set->ops->exit_request)
                set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
 
-       blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
-
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
 
@@ -2216,12 +2214,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
            set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
                goto free_bitmap;
 
-       if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
-               goto exit_hctx;
-
        hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
        if (!hctx->fq)
-               goto sched_exit_hctx;
+               goto exit_hctx;
 
        if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
                goto free_fq;
@@ -2235,8 +2230,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
 
  free_fq:
        kfree(hctx->fq);
- sched_exit_hctx:
-       blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
  exit_hctx:
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
@@ -2898,10 +2891,81 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        return ret;
 }
 
+/*
+ * request_queue and elevator_type pair.
+ * It is just used by __blk_mq_update_nr_hw_queues to cache
+ * the elevator_type associated with a request_queue.
+ */
+struct blk_mq_qe_pair {
+       struct list_head node;
+       struct request_queue *q;
+       struct elevator_type *type;
+};
+
+/*
+ * Cache the elevator_type in qe pair list and switch the
+ * io scheduler to 'none'
+ */
+static bool blk_mq_elv_switch_none(struct list_head *head,
+               struct request_queue *q)
+{
+       struct blk_mq_qe_pair *qe;
+
+       if (!q->elevator)
+               return true;
+
+       qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
+       if (!qe)
+               return false;
+
+       INIT_LIST_HEAD(&qe->node);
+       qe->q = q;
+       qe->type = q->elevator->type;
+       list_add(&qe->node, head);
+
+       mutex_lock(&q->sysfs_lock);
+       /*
+        * After elevator_switch_mq, the previous elevator_queue will be
+        * released by elevator_release. The reference on the io scheduler
+        * module taken by elevator_get will also be dropped, so take a
+        * reference on the io scheduler module here to prevent it from
+        * being removed.
+        */
+       __module_get(qe->type->elevator_owner);
+       elevator_switch_mq(q, NULL);
+       mutex_unlock(&q->sysfs_lock);
+
+       return true;
+}
+
+static void blk_mq_elv_switch_back(struct list_head *head,
+               struct request_queue *q)
+{
+       struct blk_mq_qe_pair *qe;
+       struct elevator_type *t = NULL;
+
+       list_for_each_entry(qe, head, node)
+               if (qe->q == q) {
+                       t = qe->type;
+                       break;
+               }
+
+       if (!t)
+               return;
+
+       list_del(&qe->node);
+       kfree(qe);
+
+       mutex_lock(&q->sysfs_lock);
+       elevator_switch_mq(q, t);
+       mutex_unlock(&q->sysfs_lock);
+}
+
 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                                                        int nr_hw_queues)
 {
        struct request_queue *q;
+       LIST_HEAD(head);
 
        lockdep_assert_held(&set->tag_list_lock);
 
@@ -2912,6 +2976,14 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 
        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_freeze_queue(q);
+       /*
+        * Switch IO scheduler to 'none', cleaning up the data associated
+        * with the previous scheduler. We will switch back once we are done
+        * updating the new sw to hw queue mappings.
+        */
+       list_for_each_entry(q, &set->tag_list, tag_set_list)
+               if (!blk_mq_elv_switch_none(&head, q))
+                       goto switch_back;
 
        set->nr_hw_queues = nr_hw_queues;
        blk_mq_update_queue_map(set);
@@ -2920,6 +2992,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                blk_mq_queue_reinit(q);
        }
 
+switch_back:
+       list_for_each_entry(q, &set->tag_list, tag_set_list)
+               blk_mq_elv_switch_back(&head, q);
+
        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_unfreeze_queue(q);
 }
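
The reason the freeze/quiesce pair moves out of elevator_switch_mq() is
that __blk_mq_update_nr_hw_queues() above already freezes every queue in
the tag set before blk_mq_elv_switch_none()/_back() run, so freezing
again inside the helper would be redundant. The sysfs elevator-switch
path keeps the old behaviour by wrapping the call itself, as the
elevator.c hunk below shows; roughly (sketch, q->sysfs_lock already held
by the caller):

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	err = elevator_switch_mq(q, new_e);

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);
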
diff --git a/block/blk.h b/block/blk.h
index 644975e85053a90810dd0afa8c9117f076391b00..9db4e389582c8da7848d458a9d4f12d64a1aecae 100644
@@ -234,6 +234,8 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
 
 int elevator_init(struct request_queue *);
 int elevator_init_mq(struct request_queue *q);
+int elevator_switch_mq(struct request_queue *q,
+                             struct elevator_type *new_e);
 void elevator_exit(struct request_queue *, struct elevator_queue *);
 int elv_register_queue(struct request_queue *q);
 void elv_unregister_queue(struct request_queue *q);
diff --git a/block/elevator.c b/block/elevator.c
index fa828b5bfd4b14c977b2d3cdea41ff12f1573344..5ea6e7d600e46edcd1eec808324640ab8b7099ed 100644
@@ -933,16 +933,13 @@ void elv_unregister(struct elevator_type *e)
 }
 EXPORT_SYMBOL_GPL(elv_unregister);
 
-static int elevator_switch_mq(struct request_queue *q,
+int elevator_switch_mq(struct request_queue *q,
                              struct elevator_type *new_e)
 {
        int ret;
 
        lockdep_assert_held(&q->sysfs_lock);
 
-       blk_mq_freeze_queue(q);
-       blk_mq_quiesce_queue(q);
-
        if (q->elevator) {
                if (q->elevator->registered)
                        elv_unregister_queue(q);
@@ -968,8 +965,6 @@ static int elevator_switch_mq(struct request_queue *q,
                blk_add_trace_msg(q, "elv switch: none");
 
 out:
-       blk_mq_unquiesce_queue(q);
-       blk_mq_unfreeze_queue(q);
        return ret;
 }
 
@@ -1021,8 +1016,17 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 
        lockdep_assert_held(&q->sysfs_lock);
 
-       if (q->mq_ops)
-               return elevator_switch_mq(q, new_e);
+       if (q->mq_ops) {
+               blk_mq_freeze_queue(q);
+               blk_mq_quiesce_queue(q);
+
+               err = elevator_switch_mq(q, new_e);
+
+               blk_mq_unquiesce_queue(q);
+               blk_mq_unfreeze_queue(q);
+
+               return err;
+       }
 
        /*
         * Turn on BYPASS and drain all requests w/ elevator private data.