block/blk-mq-sched.c

   1 /*
   2  * blk-mq scheduling framework
   3  *
   4  * Copyright (C) 2016 Jens Axboe
   5  */
   6 #include <linux/kernel.h>
   7 #include <linux/module.h>
   8 #include <linux/blk-mq.h>
   9
  10 #include <trace/events/block.h>
  11
  12 #include "blk.h"
  13 #include "blk-mq.h"
  14 #include "blk-mq-debugfs.h"
  15 #include "blk-mq-sched.h"
  16 #include "blk-mq-tag.h"
  17 #include "blk-wbt.h"
  18
  19 void blk_mq_sched_free_hctx_data(struct request_queue *q,
  20                                  void (*exit)(struct blk_mq_hw_ctx *))
  21 {
  22         struct blk_mq_hw_ctx *hctx;
  23         int i;
  24
  25         queue_for_each_hw_ctx(q, hctx, i) {
  26                 if (exit && hctx->sched_data)
  27                         exit(hctx);
  28                 kfree(hctx->sched_data);
  29                 hctx->sched_data = NULL;
  30         }
  31 }
  32 EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
  33
  34 static void __blk_mq_sched_assign_ioc(struct request_queue *q,
  35                                       struct request *rq,
  36                                       struct bio *bio,
  37                                       struct io_context *ioc)
  38 {
  39         struct io_cq *icq;
  40
  41         spin_lock_irq(q->queue_lock);
  42         icq = ioc_lookup_icq(ioc, q);
  43         spin_unlock_irq(q->queue_lock);
  44
  45         if (!icq) {
  46                 icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
  47                 if (!icq)
  48                         return;
  49         }
  50
  51         rq->elv.icq = icq;
  52         if (!blk_mq_sched_get_rq_priv(q, rq, bio)) {
  53                 rq->rq_flags |= RQF_ELVPRIV;
  54                 get_io_context(icq->ioc);
  55                 return;
  56         }
  57
  58         rq->elv.icq = NULL;
  59 }
  60
  61 static void blk_mq_sched_assign_ioc(struct request_queue *q,
  62                                     struct request *rq, struct bio *bio)
  63 {
  64         struct io_context *ioc;
  65
  66         ioc = rq_ioc(bio);
  67         if (ioc)
  68                 __blk_mq_sched_assign_ioc(q, rq, bio, ioc);
  69 }
  70
  71 struct request *blk_mq_sched_get_request(struct request_queue *q,
  72                                          struct bio *bio,
  73                                          unsigned int op,
  74                                          struct blk_mq_alloc_data *data)
  75 {
  76         struct elevator_queue *e = q->elevator;
  77         struct request *rq;
  78
  79         blk_queue_enter_live(q);
  80         data->q = q;
  81         if (likely(!data->ctx))
  82                 data->ctx = blk_mq_get_ctx(q);
  83         if (likely(!data->hctx))
  84                 data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
  85
  86         if (e) {
  87                 data->flags |= BLK_MQ_REQ_INTERNAL;
  88
  89                 /*
  90                  * Flush requests are special and go directly to the
  91                  * dispatch list.
  92                  */
  93                 if (!op_is_flush(op) && e->type->ops.mq.get_request) {
  94                         rq = e->type->ops.mq.get_request(q, op, data);
  95                         if (rq)
  96                                 rq->rq_flags |= RQF_QUEUED;
  97                 } else
  98                         rq = __blk_mq_alloc_request(data, op);
  99         } else {
 100                 rq = __blk_mq_alloc_request(data, op);
 101         }
 102
 103         if (rq) {
 104                 if (!op_is_flush(op)) {
 105                         rq->elv.icq = NULL;
 106                         if (e && e->type->icq_cache)
 107                                 blk_mq_sched_assign_ioc(q, rq, bio);
 108                 }
 109                 data->hctx->queued++;
 110                 return rq;
 111         }
 112
 113         blk_queue_exit(q);
 114         return NULL;
 115 }
 116
 117 void blk_mq_sched_put_request(struct request *rq)
 118 {
 119         struct request_queue *q = rq->q;
 120         struct elevator_queue *e = q->elevator;
 121
 122         if (rq->rq_flags & RQF_ELVPRIV) {
 123                 blk_mq_sched_put_rq_priv(rq->q, rq);
 124                 if (rq->elv.icq) {
 125                         put_io_context(rq->elv.icq->ioc);
 126                         rq->elv.icq = NULL;
 127                 }
 128         }
 129
 130         if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
 131                 e->type->ops.mq.put_request(rq);
 132         else
 133                 blk_mq_finish_request(rq);
 134 }
 135
 136 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 137 {
 138         struct request_queue *q = hctx->queue;
 139         struct elevator_queue *e = q->elevator;
 140         const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
 141         bool did_work = false;
 142         LIST_HEAD(rq_list);
 143
 144         if (unlikely(blk_mq_hctx_stopped(hctx)))
 145                 return;
 146
 147         hctx->run++;
 148
 149         /*
 150          * If we have previous entries on our dispatch list, grab them first for
 151          * more fair dispatch.
 152          */
 153         if (!list_empty_careful(&hctx->dispatch)) {
 154                 spin_lock(&hctx->lock);
 155                 if (!list_empty(&hctx->dispatch))
 156                         list_splice_init(&hctx->dispatch, &rq_list);
 157                 spin_unlock(&hctx->lock);
 158         }
 159
 160         /*
 161          * Only ask the scheduler for requests, if we didn't have residual
 162          * requests from the dispatch list. This is to avoid the case where
 163          * we only ever dispatch a fraction of the requests available because
 164          * of low device queue depth. Once we pull requests out of the IO
 165          * scheduler, we can no longer merge or sort them. So it's best to
 166          * leave them there for as long as we can. Mark the hw queue as
 167          * needing a restart in that case.
 168          */
 169         if (!list_empty(&rq_list)) {
 170                 blk_mq_sched_mark_restart_hctx(hctx);
 171                 did_work = blk_mq_dispatch_rq_list(q, &rq_list);
 172         } else if (!has_sched_dispatch) {
 173                 blk_mq_flush_busy_ctxs(hctx, &rq_list);
 174                 blk_mq_dispatch_rq_list(q, &rq_list);
 175         }
 176
 177         /*
 178          * We want to dispatch from the scheduler if we had no work left
 179          * on the dispatch list, OR if we did have work but weren't able
 180          * to make progress.
 181          */
 182         if (!did_work && has_sched_dispatch) {
 183                 do {
 184                         struct request *rq;
 185
 186                         rq = e->type->ops.mq.dispatch_request(hctx);
 187                         if (!rq)
 188                                 break;
 189                         list_add(&rq->queuelist, &rq_list);
 190                 } while (blk_mq_dispatch_rq_list(q, &rq_list));
 191         }
 192 }
 193
 194 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 195                             struct request **merged_request)
 196 {
 197         struct request *rq;
 198
 199         switch (elv_merge(q, &rq, bio)) {
 200         case ELEVATOR_BACK_MERGE:
 201                 if (!blk_mq_sched_allow_merge(q, rq, bio))
 202                         return false;
 203                 if (!bio_attempt_back_merge(q, rq, bio))
 204                         return false;
 205                 *merged_request = attempt_back_merge(q, rq);
 206                 if (!*merged_request)
 207                         elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
 208                 return true;
 209         case ELEVATOR_FRONT_MERGE:
 210                 if (!blk_mq_sched_allow_merge(q, rq, bio))
 211                         return false;
 212                 if (!bio_attempt_front_merge(q, rq, bio))
 213                         return false;
 214                 *merged_request = attempt_front_merge(q, rq);
 215                 if (!*merged_request)
 216                         elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
 217                 return true;
 218         default:
 219                 return false;
 220         }
 221 }
 222 EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
 223
 224 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 225 {
 226         struct elevator_queue *e = q->elevator;
 227
 228         if (e->type->ops.mq.bio_merge) {
 229                 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
 230                 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 231
 232                 blk_mq_put_ctx(ctx);
 233                 return e->type->ops.mq.bio_merge(hctx, bio);
 234         }
 235
 236         return false;
 237 }
 238
 239 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
 240 {
 241         return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
 242 }
 243 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 244
 245 void blk_mq_sched_request_inserted(struct request *rq)
 246 {
 247         trace_block_rq_insert(rq->q, rq);
 248 }
 249 EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
 250
 251 static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
 252                                        struct request *rq)
 253 {
 254         if (rq->tag == -1) {
 255                 rq->rq_flags |= RQF_SORTED;
 256                 return false;
 257         }
 258
 259         /*
 260          * If we already have a real request tag, send directly to
 261          * the dispatch list.
 262          */
 263         spin_lock(&hctx->lock);
 264         list_add(&rq->queuelist, &hctx->dispatch);
 265         spin_unlock(&hctx->lock);
 266         return true;
 267 }
 268
 269 static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
 270 {
 271         if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
 272                 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 273                 if (blk_mq_hctx_has_pending(hctx)) {
 274                         blk_mq_run_hw_queue(hctx, true);
 275                         return true;
 276                 }
 277         }
 278         return false;
 279 }
 280
 281 /**
 282  * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 283  * @pos:    loop cursor.
 284  * @skip:   the list element that will not be examined. Iteration starts at
 285  *          @skip->next.
 286  * @head:   head of the list to examine. This list must have at least one
 287  *          element, namely @skip.
 288  * @member: name of the list_head structure within typeof(*pos).
 289  */
 290 #define list_for_each_entry_rcu_rr(pos, skip, head, member)             \
 291         for ((pos) = (skip);                                            \
 292              (pos = (pos)->member.next != (head) ? list_entry_rcu(      \
 293                         (pos)->member.next, typeof(*pos), member) :     \
 294               list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
 295              (pos) != (skip); )
 296
 297 /*
 298  * Called after a driver tag has been freed to check whether a hctx needs to
 299  * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
 300  * queues in a round-robin fashion if the tag set of @hctx is shared with other
 301  * hardware queues.
 302  */
 303 void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
 304 {
 305         struct blk_mq_tags *const tags = hctx->tags;
 306         struct blk_mq_tag_set *const set = hctx->queue->tag_set;
 307         struct request_queue *const queue = hctx->queue, *q;
 308         struct blk_mq_hw_ctx *hctx2;
 309         unsigned int i, j;
 310
 311         if (set->flags & BLK_MQ_F_TAG_SHARED) {
 312                 rcu_read_lock();
 313                 list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
 314                                            tag_set_list) {
 315                         queue_for_each_hw_ctx(q, hctx2, i)
 316                                 if (hctx2->tags == tags &&
 317                                     blk_mq_sched_restart_hctx(hctx2))
 318                                         goto done;
 319                 }
 320                 j = hctx->queue_num + 1;
 321                 for (i = 0; i < queue->nr_hw_queues; i++, j++) {
 322                         if (j == queue->nr_hw_queues)
 323                                 j = 0;
 324                         hctx2 = queue->queue_hw_ctx[j];
 325                         if (hctx2->tags == tags &&
 326                             blk_mq_sched_restart_hctx(hctx2))
 327                                 break;
 328                 }
 329 done:
 330                 rcu_read_unlock();
 331         } else {
 332                 blk_mq_sched_restart_hctx(hctx);
 333         }
 334 }
 335
 336 /*
 337  * Add flush/fua to the queue. If we fail getting a driver tag, then
 338  * punt to the requeue list. Requeue will re-invoke us from a context
 339  * that's safe to block from.
 340  */
 341 static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
 342                                       struct request *rq, bool can_block)
 343 {
 344         if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
 345                 blk_insert_flush(rq);
 346                 blk_mq_run_hw_queue(hctx, true);
 347         } else
 348                 blk_mq_add_to_requeue_list(rq, false, true);
 349 }
 350
 351 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 352                                  bool run_queue, bool async, bool can_block)
 353 {
 354         struct request_queue *q = rq->q;
 355         struct elevator_queue *e = q->elevator;
 356         struct blk_mq_ctx *ctx = rq->mq_ctx;
 357         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 358
 359         if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
 360                 blk_mq_sched_insert_flush(hctx, rq, can_block);
 361                 return;
 362         }
 363
 364         if (e && blk_mq_sched_bypass_insert(hctx, rq))
 365                 goto run;
 366
 367         if (e && e->type->ops.mq.insert_requests) {
 368                 LIST_HEAD(list);
 369
 370                 list_add(&rq->queuelist, &list);
 371                 e->type->ops.mq.insert_requests(hctx, &list, at_head);
 372         } else {
 373                 spin_lock(&ctx->lock);
 374                 __blk_mq_insert_request(hctx, rq, at_head);
 375                 spin_unlock(&ctx->lock);
 376         }
 377
 378 run:
 379         if (run_queue)
 380                 blk_mq_run_hw_queue(hctx, async);
 381 }
 382
 383 void blk_mq_sched_insert_requests(struct request_queue *q,
 384                                   struct blk_mq_ctx *ctx,
 385                                   struct list_head *list, bool run_queue_async)
 386 {
 387         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 388         struct elevator_queue *e = hctx->queue->elevator;
 389
 390         if (e) {
 391                 struct request *rq, *next;
 392
 393                 /*
 394                  * We bypass requests that already have a driver tag assigned,
 395                  * which should only be flushes. Flushes are only ever inserted
 396                  * as single requests, so we shouldn't ever hit the
 397                  * WARN_ON_ONCE() below (but let's handle it just in case).
 398                  */
 399                 list_for_each_entry_safe(rq, next, list, queuelist) {
 400                         if (WARN_ON_ONCE(rq->tag != -1)) {
 401                                 list_del_init(&rq->queuelist);
 402                                 blk_mq_sched_bypass_insert(hctx, rq);
 403                         }
 404                 }
 405         }
 406
 407         if (e && e->type->ops.mq.insert_requests)
 408                 e->type->ops.mq.insert_requests(hctx, list, false);
 409         else
 410                 blk_mq_insert_requests(hctx, ctx, list);
 411
 412         blk_mq_run_hw_queue(hctx, run_queue_async);
 413 }
 414
 415 static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
 416                                    struct blk_mq_hw_ctx *hctx,
 417                                    unsigned int hctx_idx)
 418 {
 419         if (hctx->sched_tags) {
 420                 blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
 421                 blk_mq_free_rq_map(hctx->sched_tags);
 422                 hctx->sched_tags = NULL;
 423         }
 424 }
 425
 426 static int blk_mq_sched_alloc_tags(struct request_queue *q,
 427                                    struct blk_mq_hw_ctx *hctx,
 428                                    unsigned int hctx_idx)
 429 {
 430         struct blk_mq_tag_set *set = q->tag_set;
 431         int ret;
 432
 433         hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
 434                                                set->reserved_tags);
 435         if (!hctx->sched_tags)
 436                 return -ENOMEM;
 437
 438         ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
 439         if (ret)
 440                 blk_mq_sched_free_tags(set, hctx, hctx_idx);
 441
 442         return ret;
 443 }
 444
 445 static void blk_mq_sched_tags_teardown(struct request_queue *q)
 446 {
 447         struct blk_mq_tag_set *set = q->tag_set;
 448         struct blk_mq_hw_ctx *hctx;
 449         int i;
 450
 451         queue_for_each_hw_ctx(q, hctx, i)
 452                 blk_mq_sched_free_tags(set, hctx, i);
 453 }
 454
 455 int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 456                            unsigned int hctx_idx)
 457 {
 458         struct elevator_queue *e = q->elevator;
 459         int ret;
 460
 461         if (!e)
 462                 return 0;
 463
 464         ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
 465         if (ret)
 466                 return ret;
 467
 468         if (e->type->ops.mq.init_hctx) {
 469                 ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
 470                 if (ret) {
 471                         blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
 472                         return ret;
 473                 }
 474         }
 475
 476         blk_mq_debugfs_register_sched_hctx(q, hctx);
 477
 478         return 0;
 479 }
 480
 481 void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 482                             unsigned int hctx_idx)
 483 {
 484         struct elevator_queue *e = q->elevator;
 485
 486         if (!e)
 487                 return;
 488
 489         blk_mq_debugfs_unregister_sched_hctx(hctx);
 490
 491         if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
 492                 e->type->ops.mq.exit_hctx(hctx, hctx_idx);
 493                 hctx->sched_data = NULL;
 494         }
 495
 496         blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
 497 }
 498
 499 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 500 {
 501         struct blk_mq_hw_ctx *hctx;
 502         struct elevator_queue *eq;
 503         unsigned int i;
 504         int ret;
 505
 506         if (!e) {
 507                 q->elevator = NULL;
 508                 return 0;
 509         }
 510
 511         /*
 512          * Default to 256, since we don't split into sync/async like the
 513          * old code did. Additionally, this is a per-hw queue depth.
 514          */
 515         q->nr_requests = 2 * BLKDEV_MAX_RQ;
 516
 517         queue_for_each_hw_ctx(q, hctx, i) {
 518                 ret = blk_mq_sched_alloc_tags(q, hctx, i);
 519                 if (ret)
 520                         goto err;
 521         }
 522
 523         ret = e->ops.mq.init_sched(q, e);
 524         if (ret)
 525                 goto err;
 526
 527         blk_mq_debugfs_register_sched(q);
 528
 529         queue_for_each_hw_ctx(q, hctx, i) {
 530                 if (e->ops.mq.init_hctx) {
 531                         ret = e->ops.mq.init_hctx(hctx, i);
 532                         if (ret) {
 533                                 eq = q->elevator;
 534                                 blk_mq_exit_sched(q, eq);
 535                                 kobject_put(&eq->kobj);
 536                                 return ret;
 537                         }
 538                 }
 539                 blk_mq_debugfs_register_sched_hctx(q, hctx);
 540         }
 541
 542         return 0;
 543
 544 err:
 545         blk_mq_sched_tags_teardown(q);
 546         q->elevator = NULL;
 547         return ret;
 548 }
 549
 550 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 551 {
 552         struct blk_mq_hw_ctx *hctx;
 553         unsigned int i;
 554
 555         queue_for_each_hw_ctx(q, hctx, i) {
 556                 blk_mq_debugfs_unregister_sched_hctx(hctx);
 557                 if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
 558                         e->type->ops.mq.exit_hctx(hctx, i);
 559                         hctx->sched_data = NULL;
 560                 }
 561         }
 562         blk_mq_debugfs_unregister_sched(q);
 563         if (e->type->ops.mq.exit_sched)
 564                 e->type->ops.mq.exit_sched(e);
 565         blk_mq_sched_tags_teardown(q);
 566         q->elevator = NULL;
 567 }
 568
 569 int blk_mq_sched_init(struct request_queue *q)
 570 {
 571         int ret;
 572
 573         mutex_lock(&q->sysfs_lock);
 574         ret = elevator_init(q, NULL);
 575         mutex_unlock(&q->sysfs_lock);
 576
 577         return ret;
 578 }