/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include "blk.h"

#define MAX_KEY_LEN 100

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);           /* protected by blkcg_pol_mutex */

static bool blkcg_debug_stats = false;

static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkcg_policy *pol)
{
        return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
        int i;

        if (!blkg)
                return;

        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (blkg->pd[i])
                        blkcg_policy[i]->pd_free_fn(blkg->pd[i]);

        if (blkg->blkcg != &blkcg_root)
                blk_exit_rl(blkg->q, &blkg->rl);

        blkg_rwstat_exit(&blkg->stat_ios);
        blkg_rwstat_exit(&blkg->stat_bytes);
        kfree(blkg);
}

static void __blkg_release(struct rcu_head *rcu)
{
        struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

        percpu_ref_exit(&blkg->refcnt);

        /* release the blkcg and parent blkg refs this blkg has been holding */
        css_put(&blkg->blkcg->css);
        if (blkg->parent)
                blkg_put(blkg->parent);

        wb_congested_put(blkg->wb_congested);

        blkg_free(blkg);
}

/*
 * A group is RCU protected, but holding the RCU read lock alone does not
 * mean one can access all the fields of blkg and assume they are valid.
 * For example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under RCU allows access only to values local
 * to the group, like group stats and group rate limits.
 */
static void blkg_release(struct percpu_ref *ref)
{
        struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

        call_rcu(&blkg->rcu_head, __blkg_release);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
                                   gfp_t gfp_mask)
{
        struct blkcg_gq *blkg;
        int i;

        /* alloc and init base part */
        blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
        if (!blkg)
                return NULL;

        if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
            blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
                goto err_free;

        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;

        /* root blkg uses @q->root_rl, init rl only for !root blkgs */
        if (blkcg != &blkcg_root) {
                if (blk_init_rl(&blkg->rl, q, gfp_mask))
                        goto err_free;
                blkg->rl.blkg = blkg;
        }

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkg_policy_data *pd;

                if (!blkcg_policy_enabled(q, pol))
                        continue;

                /* alloc per-policy data and attach it to blkg */
                pd = pol->pd_alloc_fn(gfp_mask, q->node);
                if (!pd)
                        goto err_free;

                blkg->pd[i] = pd;
                pd->blkg = blkg;
                pd->plid = i;
        }

        return blkg;

err_free:
        blkg_free(blkg);
        return NULL;
}

struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
                                      struct request_queue *q, bool update_hint)
{
        struct blkcg_gq *blkg;

        /*
         * Hint didn't match.  Look up from the radix tree.  Note that the
         * hint can only be updated under queue_lock as otherwise @blkg
         * could have already been removed from blkg_tree.  The caller is
         * responsible for grabbing queue_lock if @update_hint.
         */
        blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
        if (blkg && blkg->q == q) {
                if (update_hint) {
                        lockdep_assert_held(q->queue_lock);
                        rcu_assign_pointer(blkcg->blkg_hint, blkg);
                }
                return blkg;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
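/*
 * Usage sketch (illustrative only): IO-path callers normally go through
 * the blkg_lookup() fast path in blk-cgroup.h, which consults
 * blkcg->blkg_hint first and falls back to this slowpath on a miss:
 *
 *	rcu_read_lock();
 *	blkg = blkg_lookup(blkcg, q);
 *	if (blkg)
 *		; // access only group-local data, e.g. stats and limits
 *	rcu_read_unlock();
 *
 * Passing @update_hint == true is only valid with @q->queue_lock held,
 * as asserted above.
 */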

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                                    struct request_queue *q,
                                    struct blkcg_gq *new_blkg)
{
        struct blkcg_gq *blkg;
        struct bdi_writeback_congested *wb_congested;
        int i, ret;

        WARN_ON_ONCE(!rcu_read_lock_held());
        lockdep_assert_held(q->queue_lock);

        /* blkg holds a reference to blkcg */
        if (!css_tryget_online(&blkcg->css)) {
                ret = -ENODEV;
                goto err_free_blkg;
        }

        wb_congested = wb_congested_get_create(q->backing_dev_info,
                                               blkcg->css.id,
                                               GFP_NOWAIT | __GFP_NOWARN);
        if (!wb_congested) {
                ret = -ENOMEM;
                goto err_put_css;
        }

        /* allocate */
        if (!new_blkg) {
                new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto err_put_congested;
                }
        }
        blkg = new_blkg;
        blkg->wb_congested = wb_congested;

        /* link parent */
        if (blkcg_parent(blkcg)) {
                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
                if (WARN_ON_ONCE(!blkg->parent)) {
                        ret = -ENODEV;
                        goto err_put_congested;
                }
                blkg_get(blkg->parent);
        }

        ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
                              GFP_NOWAIT | __GFP_NOWARN);
        if (ret)
                goto err_cancel_ref;

        /* invoke per-policy init */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkg->pd[i] && pol->pd_init_fn)
                        pol->pd_init_fn(blkg->pd[i]);
        }

        /* insert */
        spin_lock(&blkcg->lock);
        ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
        if (likely(!ret)) {
                hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
                list_add(&blkg->q_node, &q->blkg_list);

                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];

                        if (blkg->pd[i] && pol->pd_online_fn)
                                pol->pd_online_fn(blkg->pd[i]);
                }
        }
        blkg->online = true;
        spin_unlock(&blkcg->lock);

        if (!ret)
                return blkg;

        /* @blkg failed to be fully initialized, use the usual release path */
        blkg_put(blkg);
        return ERR_PTR(ret);

err_cancel_ref:
        percpu_ref_exit(&blkg->refcnt);
err_put_congested:
        wb_congested_put(wb_congested);
err_put_css:
        css_put(&blkcg->css);
err_free_blkg:
        blkg_free(new_blkg);
        return ERR_PTR(ret);
}

/**
 * __blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkg's have access to the parent blkg.  This function
 * should be called under RCU read lock and @q->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create fails as it walks
 * down from root.
 */
struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
                                      struct request_queue *q)
{
        struct blkcg_gq *blkg;

        WARN_ON_ONCE(!rcu_read_lock_held());
        lockdep_assert_held(q->queue_lock);

        /*
         * This could be the first entry point of blkcg implementation and
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
                return q->root_blkg;

        blkg = __blkg_lookup(blkcg, q, true);
        if (blkg)
                return blkg;

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.  Returns the closest
         * blkg to the intended blkg should blkg_create() fail.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent = blkcg_parent(blkcg);
                struct blkcg_gq *ret_blkg = q->root_blkg;

                while (parent) {
                        blkg = __blkg_lookup(parent, q, false);
                        if (blkg) {
                                /* remember closest blkg */
                                ret_blkg = blkg;
                                break;
                        }
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                blkg = blkg_create(pos, q, NULL);
                if (IS_ERR(blkg))
                        return ret_blkg;
                if (pos == blkcg)
                        return blkg;
        }
}

/**
 * blkg_lookup_create - find or create a blkg
 * @blkcg: target block cgroup
 * @q: target request_queue
 *
 * This looks up or creates the blkg representing the unique pair
 * of the blkcg and the request_queue.
 */
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                                    struct request_queue *q)
{
        struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
        unsigned long flags;

        if (unlikely(!blkg)) {
                spin_lock_irqsave(q->queue_lock, flags);

                blkg = __blkg_lookup_create(blkcg, q);

                spin_unlock_irqrestore(q->queue_lock, flags);
        }

        return blkg;
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
        struct blkcg *blkcg = blkg->blkcg;
        struct blkcg_gq *parent = blkg->parent;
        int i;

        lockdep_assert_held(blkg->q->queue_lock);
        lockdep_assert_held(&blkcg->lock);

        /* Something is wrong if we are trying to remove the same group twice */
        WARN_ON_ONCE(list_empty(&blkg->q_node));
        WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkg->pd[i] && pol->pd_offline_fn)
                        pol->pd_offline_fn(blkg->pd[i]);
        }

        if (parent) {
                blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
                blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
        }

        blkg->online = false;

        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
        list_del_init(&blkg->q_node);
        hlist_del_init_rcu(&blkg->blkcg_node);

        /*
         * Both setting lookup hint to and clearing it from @blkg are done
         * under queue_lock.  If it's not pointing to @blkg now, it never
         * will.  Hint assignment itself can race safely.
         */
        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
        percpu_ref_kill(&blkg->refcnt);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
        struct blkcg_gq *blkg, *n;

        lockdep_assert_held(q->queue_lock);

        list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;

                spin_lock(&blkcg->lock);
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);
        }

        q->root_blkg = NULL;
        q->root_rl.blkg = NULL;
}

/*
 * The iterator "next" function used by blk_queue_for_each_rl().  It's a
 * bit tricky because the root blkg uses @q->root_rl instead of its own rl.
 */
struct request_list *__blk_queue_next_rl(struct request_list *rl,
                                         struct request_queue *q)
{
        struct list_head *ent;
        struct blkcg_gq *blkg;

        /*
         * Determine the current blkg list_head.  The first entry is
         * root_rl which is off @q->blkg_list and mapped to the head.
         */
        if (rl == &q->root_rl) {
                ent = &q->blkg_list;
                /* There are no more block groups, hence no request lists */
                if (list_empty(ent))
                        return NULL;
        } else {
                blkg = container_of(rl, struct blkcg_gq, rl);
                ent = &blkg->q_node;
        }

        /* walk to the next list_head, skip root blkcg */
        ent = ent->next;
        if (ent == &q->root_blkg->q_node)
                ent = ent->next;
        if (ent == &q->blkg_list)
                return NULL;

        blkg = container_of(ent, struct blkcg_gq, q_node);
        return &blkg->rl;
}

static int blkcg_reset_stats(struct cgroup_subsys_state *css,
                             struct cftype *cftype, u64 val)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg_gq *blkg;
        int i;

        mutex_lock(&blkcg_pol_mutex);
        spin_lock_irq(&blkcg->lock);

        /*
         * Note that stat reset is racy - it doesn't synchronize against
         * stat updates.  This is a debug feature which shouldn't exist
         * anyway.  If you get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
                blkg_rwstat_reset(&blkg->stat_bytes);
                blkg_rwstat_reset(&blkg->stat_ios);

                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];

                        if (blkg->pd[i] && pol->pd_reset_stats_fn)
                                pol->pd_reset_stats_fn(blkg->pd[i]);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        mutex_unlock(&blkcg_pol_mutex);
        return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
        /* some drivers (floppy) instantiate a queue w/o disk registered */
        if (blkg->q->backing_dev_info->dev)
                return dev_name(blkg->q->backing_dev_info->dev);
        return NULL;
}
EXPORT_SYMBOL_GPL(blkg_dev_name);

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data and the matching queue lock held.  If @show_total
 * is %true, the sum of the return values from @prfill is printed with
 * "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total)
{
        struct blkcg_gq *blkg;
        u64 total = 0;

        rcu_read_lock();
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                spin_lock_irq(blkg->q->queue_lock);
                if (blkcg_policy_enabled(blkg->q, pol))
                        total += prfill(sf, blkg->pd[pol->plid], data);
                spin_unlock_irq(blkg->q->queue_lock);
        }
        rcu_read_unlock();

        if (show_total)
                seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
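/*
 * Example: a policy typically builds its seq_show handlers on top of
 * blkcg_print_blkgs() plus a prfill callback.  A minimal sketch; the
 * "example_*" policy, struct and field are hypothetical and not defined
 * in this file:
 *
 *	static u64 example_prfill(struct seq_file *sf,
 *				  struct blkg_policy_data *pd, int off)
 *	{
 *		struct example_pd *epd = container_of(pd, struct example_pd, pd);
 *
 *		return __blkg_prfill_u64(sf, pd, epd->some_counter);
 *	}
 *
 *	static int example_seq_show(struct seq_file *sf, void *v)
 *	{
 *		blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
 *				  example_prfill, &example_blkcg_policy, 0, true);
 *		return 0;
 *	}
 *
 * blkg_prfill_stat() and blkg_prfill_rwstat() below are ready-made prfill
 * callbacks for fields embedded in a policy's blkg_policy_data.
 */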

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
        const char *dname = blkg_dev_name(pd->blkg);

        if (!dname)
                return 0;

        seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
        return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                         const struct blkg_rwstat *rwstat)
{
        static const char *rwstr[] = {
                [BLKG_RWSTAT_READ]      = "Read",
                [BLKG_RWSTAT_WRITE]     = "Write",
                [BLKG_RWSTAT_SYNC]      = "Sync",
                [BLKG_RWSTAT_ASYNC]     = "Async",
                [BLKG_RWSTAT_DISCARD]   = "Discard",
        };
        const char *dname = blkg_dev_name(pd->blkg);
        u64 v;
        int i;

        if (!dname)
                return 0;

        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
                           (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));

        v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
                atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
                atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);

/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_stat in @pd
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
{
        return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_rwstat in @pd
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                       int off)
{
        struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);

        return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);

static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
                                    struct blkg_policy_data *pd, int off)
{
        struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);

        return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/**
 * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
 * @sf: seq_file to print to
 * @v: unused
 *
 * To be used as cftype->seq_show to print blkg->stat_bytes.
 * cftype->private must be set to the blkcg_policy.
 */
int blkg_print_stat_bytes(struct seq_file *sf, void *v)
{
        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
                          blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
                          offsetof(struct blkcg_gq, stat_bytes), true);
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);

/**
 * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
 * @sf: seq_file to print to
 * @v: unused
 *
 * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
 * must be set to the blkcg_policy.
 */
int blkg_print_stat_ios(struct seq_file *sf, void *v)
{
        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
                          blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
                          offsetof(struct blkcg_gq, stat_ios), true);
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_ios);

static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
                                              struct blkg_policy_data *pd,
                                              int off)
{
        struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
                                                              NULL, off);
        return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/**
 * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
 * @sf: seq_file to print to
 * @v: unused
 */
int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
{
        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
                          blkg_prfill_rwstat_field_recursive,
                          (void *)seq_cft(sf)->private,
                          offsetof(struct blkcg_gq, stat_bytes), true);
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);

/**
 * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
 * @sf: seq_file to print to
 * @v: unused
 */
int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
{
        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
                          blkg_prfill_rwstat_field_recursive,
                          (void *)seq_cft(sf)->private,
                          offsetof(struct blkcg_gq, stat_ios), true);
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);

/**
 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
 * @blkg: blkg of interest
 * @pol: blkcg_policy which contains the blkg_stat
 * @off: offset to the blkg_stat in blkg_policy_data or @blkg
 *
 * Collect the blkg_stat specified by @blkg, @pol and @off and all its
 * online descendants and their aux counts.  The caller must be holding the
 * queue lock for online tests.
 *
 * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
 * at @off bytes into @blkg's blkg_policy_data of the policy.
 */
u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
                            struct blkcg_policy *pol, int off)
{
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        u64 sum = 0;

        lockdep_assert_held(blkg->q->queue_lock);

        rcu_read_lock();
        blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
                struct blkg_stat *stat;

                if (!pos_blkg->online)
                        continue;

                if (pol)
                        stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
                else
                        stat = (void *)blkg + off;

                sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
        }
        rcu_read_unlock();

        return sum;
}
EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);

/**
 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
 * @blkg: blkg of interest
 * @pol: blkcg_policy which contains the blkg_rwstat
 * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
 *
 * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
 * online descendants and their aux counts.  The caller must be holding the
 * queue lock for online tests.
 *
 * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
 * is at @off bytes into @blkg's blkg_policy_data of the policy.
 */
struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
                                             struct blkcg_policy *pol, int off)
{
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        struct blkg_rwstat sum = { };
        int i;

        lockdep_assert_held(blkg->q->queue_lock);

        rcu_read_lock();
        blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
                struct blkg_rwstat *rwstat;

                if (!pos_blkg->online)
                        continue;

                if (pol)
                        rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
                else
                        rwstat = (void *)pos_blkg + off;

                for (i = 0; i < BLKG_RWSTAT_NR; i++)
                        atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
                                percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
                                &sum.aux_cnt[i]);
        }
        rcu_read_unlock();

        return sum;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
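/*
 * Example: blkcg_print_stat() below passes @pol == NULL to sum the
 * blkg-embedded stat_bytes/stat_ios.  A policy summing a blkg_rwstat
 * embedded in its own pd would instead do something like the following
 * sketch (the "example_*" names are hypothetical; queue_lock held by
 * the caller):
 *
 *	struct blkg_rwstat sum;
 *	u64 reads;
 *
 *	sum = blkg_rwstat_recursive_sum(blkg, &example_blkcg_policy,
 *			offsetof(struct example_pd, serviced));
 *	reads = atomic64_read(&sum.aux_cnt[BLKG_RWSTAT_READ]);
 */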

/* Performs queue bypass and policy enabled checks then looks up blkg. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
                                          const struct blkcg_policy *pol,
                                          struct request_queue *q)
{
        WARN_ON_ONCE(!rcu_read_lock_held());
        lockdep_assert_held(q->queue_lock);

        if (!blkcg_policy_enabled(q, pol))
                return ERR_PTR(-EOPNOTSUPP);

        /*
         * This could be the first entry point of blkcg implementation and
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
                return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);

        return __blkg_lookup(blkcg, q, true /* update_hint */);
}

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
 * part of @input following MAJ:MIN.  This function returns with RCU read
 * lock and queue lock held and must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   char *input, struct blkg_conf_ctx *ctx)
        __acquires(rcu) __acquires(disk->queue->queue_lock)
{
        struct gendisk *disk;
        struct request_queue *q;
        struct blkcg_gq *blkg;
        unsigned int major, minor;
        int key_len, part, ret;
        char *body;

        if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
                return -EINVAL;

        body = input + key_len;
        if (!isspace(*body))
                return -EINVAL;
        body = skip_spaces(body);

        disk = get_gendisk(MKDEV(major, minor), &part);
        if (!disk)
                return -ENODEV;
        if (part) {
                ret = -ENODEV;
                goto fail;
        }

        q = disk->queue;

        rcu_read_lock();
        spin_lock_irq(q->queue_lock);

        blkg = blkg_lookup_check(blkcg, pol, q);
        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                goto fail_unlock;
        }

        if (blkg)
                goto success;

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent;
                struct blkcg_gq *new_blkg;

                parent = blkcg_parent(blkcg);
                while (parent && !__blkg_lookup(parent, q, false)) {
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                /* Drop locks to do new blkg allocation with GFP_KERNEL. */
                spin_unlock_irq(q->queue_lock);
                rcu_read_unlock();

                new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto fail;
                }

                rcu_read_lock();
                spin_lock_irq(q->queue_lock);

                blkg = blkg_lookup_check(pos, pol, q);
                if (IS_ERR(blkg)) {
                        ret = PTR_ERR(blkg);
                        goto fail_unlock;
                }

                if (blkg) {
                        blkg_free(new_blkg);
                } else {
                        blkg = blkg_create(pos, q, new_blkg);
                        if (unlikely(IS_ERR(blkg))) {
                                ret = PTR_ERR(blkg);
                                goto fail_unlock;
                        }
                }

                if (pos == blkcg)
                        goto success;
        }
success:
        ctx->disk = disk;
        ctx->blkg = blkg;
        ctx->body = body;
        return 0;

fail_unlock:
        spin_unlock_irq(q->queue_lock);
        rcu_read_unlock();
fail:
        put_disk_and_module(disk);
        /*
         * If queue was bypassing, we should retry.  Do so after a
         * short msleep().  It isn't strictly necessary but queue
         * can be bypassing for some time and it's always nice to
         * avoid busy looping.
         */
        if (ret == -EBUSY) {
                msleep(10);
                ret = restart_syscall();
        }
        return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
        __releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
        spin_unlock_irq(ctx->disk->queue->queue_lock);
        rcu_read_unlock();
        put_disk_and_module(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
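/*
 * Example: a policy's cftype ->write handler pairs the two helpers and
 * parses "MAJ:MIN <body>" input.  A hedged sketch; the "example_*" policy
 * and how the parsed value is applied are hypothetical:
 *
 *	static ssize_t example_set_limit(struct kernfs_open_file *of,
 *					 char *buf, size_t nbytes, loff_t off)
 *	{
 *		struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *		struct blkg_conf_ctx ctx;
 *		u64 val;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, &example_blkcg_policy, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		ret = -EINVAL;
 *		if (sscanf(ctx.body, "%llu", &val) == 1) {
 *			// apply @val to ctx.blkg's policy data here
 *			ret = 0;
 *		}
 *
 *		blkg_conf_finish(&ctx);	// drops RCU and queue locks
 *		return ret ?: nbytes;
 *	}
 */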

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
        struct blkcg_gq *blkg;

        rcu_read_lock();

        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                const char *dname;
                char *buf;
                struct blkg_rwstat rwstat;
                u64 rbytes, wbytes, rios, wios, dbytes, dios;
                size_t size = seq_get_buf(sf, &buf), off = 0;
                int i;
                bool has_stats = false;

                dname = blkg_dev_name(blkg);
                if (!dname)
                        continue;

                /*
                 * Hooray string manipulation, count is the size written NOT
                 * INCLUDING THE \0, so size is now count+1 less than what we
                 * had before, but we want to start writing the next bit from
                 * the \0 so we only add count to buf.
                 */
                off += scnprintf(buf+off, size-off, "%s ", dname);

                spin_lock_irq(blkg->q->queue_lock);

                rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
                                        offsetof(struct blkcg_gq, stat_bytes));
                rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
                wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
                dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);

                rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
                                        offsetof(struct blkcg_gq, stat_ios));
                rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
                wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
                dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);

                spin_unlock_irq(blkg->q->queue_lock);

                if (rbytes || wbytes || rios || wios) {
                        has_stats = true;
                        off += scnprintf(buf+off, size-off,
                                         "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
                                         rbytes, wbytes, rios, wios,
                                         dbytes, dios);
                }

                if (!blkcg_debug_stats)
                        goto next;

                if (atomic_read(&blkg->use_delay)) {
                        has_stats = true;
                        off += scnprintf(buf+off, size-off,
                                         " use_delay=%d delay_nsec=%llu",
                                         atomic_read(&blkg->use_delay),
                                         (unsigned long long)atomic64_read(&blkg->delay_nsec));
                }

                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];
                        size_t written;

                        if (!blkg->pd[i] || !pol->pd_stat_fn)
                                continue;

                        written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
                        if (written)
                                has_stats = true;
                        off += written;
                }
next:
                if (has_stats) {
                        off += scnprintf(buf+off, size-off, "\n");
                        seq_commit(sf, off);
                }
        }

        rcu_read_unlock();
        return 0;
}

static struct cftype blkcg_files[] = {
        {
                .name = "stat",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = blkcg_print_stat,
        },
        { }     /* terminate */
};

static struct cftype blkcg_legacy_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkcg_reset_stats,
        },
        { }     /* terminate */
};

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback.  Here we tie the next stage of blkg destruction
 *    to the completion of writeback associated with the blkcg.  This lets us
 *    avoid punting potentially large amounts of outstanding writeback to root
 *    while maintaining any ongoing policies.  The next stage is triggered when
 *    the cgwb reference count (blkcg->cgwb_refcnt) goes to zero.
 *
 * 2. When the cgwb refcount goes to zero, blkcg_destroy_blkgs() is called
 *    and handles the destruction of blkgs.  Here the css reference held by
 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
 *    This work may occur in cgwb_release_workfn() on the cgwb_release
 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
 *    punted to the root_blkg.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
 *    This finally frees the blkcg.
 */

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away.  Here the cgwbs are
 * offlined first and only once writeback associated with the blkcg has
 * finished do we start step 2 (see above).
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);

        /* this prevents anyone from attaching or migrating to this blkcg */
        wb_blkcg_offline(blkcg);

        /* put the base cgwb reference allowing step 2 to be triggered */
        blkcg_cgwb_put(blkcg);
}

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
        spin_lock_irq(&blkcg->lock);

        while (!hlist_empty(&blkcg->blkg_list)) {
                struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
                                                struct blkcg_gq, blkcg_node);
                struct request_queue *q = blkg->q;

                if (spin_trylock(q->queue_lock)) {
                        blkg_destroy(blkg);
                        spin_unlock(q->queue_lock);
                } else {
                        spin_unlock_irq(&blkcg->lock);
                        cpu_relax();
                        spin_lock_irq(&blkcg->lock);
                }
        }

        spin_unlock_irq(&blkcg->lock);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        int i;

        mutex_lock(&blkcg_pol_mutex);

        list_del(&blkcg->all_blkcgs_node);

        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (blkcg->cpd[i])
                        blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

        mutex_unlock(&blkcg_pol_mutex);

        kfree(blkcg);
}

static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct blkcg *blkcg;
        struct cgroup_subsys_state *ret;
        int i;

        mutex_lock(&blkcg_pol_mutex);

        if (!parent_css) {
                blkcg = &blkcg_root;
        } else {
                blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
                if (!blkcg) {
                        ret = ERR_PTR(-ENOMEM);
                        goto unlock;
                }
        }

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkcg_policy_data *cpd;

                /*
                 * If the policy hasn't been registered yet, skip it here;
                 * its cpd will be allocated for all existing blkcgs at
                 * registration time. Otherwise, check if the policy
                 * requires any specific per-cgroup data: if it does,
                 * allocate and initialize it.
                 */
                if (!pol || !pol->cpd_alloc_fn)
                        continue;

                cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                if (!cpd) {
                        ret = ERR_PTR(-ENOMEM);
                        goto free_pd_blkcg;
                }
                blkcg->cpd[i] = cpd;
                cpd->blkcg = blkcg;
                cpd->plid = i;
                if (pol->cpd_init_fn)
                        pol->cpd_init_fn(cpd);
        }

        spin_lock_init(&blkcg->lock);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&blkcg->cgwb_list);
        refcount_set(&blkcg->cgwb_refcnt, 1);
#endif
        list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

        mutex_unlock(&blkcg_pol_mutex);
        return &blkcg->css;

free_pd_blkcg:
        for (i--; i >= 0; i--)
                if (blkcg->cpd[i])
                        blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

        if (blkcg != &blkcg_root)
                kfree(blkcg);
unlock:
        mutex_unlock(&blkcg_pol_mutex);
        return ret;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
        struct blkcg_gq *new_blkg, *blkg;
        bool preloaded;
        int ret;

        new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
        if (!new_blkg)
                return -ENOMEM;

        preloaded = !radix_tree_preload(GFP_KERNEL);

        /* Make sure the root blkg exists. */
        rcu_read_lock();
        spin_lock_irq(q->queue_lock);
        blkg = blkg_create(&blkcg_root, q, new_blkg);
        if (IS_ERR(blkg))
                goto err_unlock;
        q->root_blkg = blkg;
        q->root_rl.blkg = blkg;
        spin_unlock_irq(q->queue_lock);
        rcu_read_unlock();

        if (preloaded)
                radix_tree_preload_end();

        ret = blk_iolatency_init(q);
        if (ret) {
                spin_lock_irq(q->queue_lock);
                blkg_destroy_all(q);
                spin_unlock_irq(q->queue_lock);
                return ret;
        }

        ret = blk_throtl_init(q);
        if (ret) {
                spin_lock_irq(q->queue_lock);
                blkg_destroy_all(q);
                spin_unlock_irq(q->queue_lock);
        }
        return ret;

err_unlock:
        spin_unlock_irq(q->queue_lock);
        rcu_read_unlock();
        if (preloaded)
                radix_tree_preload_end();
        return PTR_ERR(blkg);
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
        lockdep_assert_held(q->queue_lock);

        /*
         * @q could be exiting and already have destroyed all blkgs as
         * indicated by NULL root_blkg.  If so, don't confuse policies.
         */
        if (!q->root_blkg)
                return;

        blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
        spin_lock_irq(q->queue_lock);
        blkg_destroy_all(q);
        spin_unlock_irq(q->queue_lock);

        blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *dst_css;
        struct io_context *ioc;
        int ret = 0;

        /* task_lock() is needed to avoid races with exit_io_context() */
        cgroup_taskset_for_each(task, dst_css, tset) {
                task_lock(task);
                ioc = task->io_context;
                if (ioc && atomic_read(&ioc->nr_tasks) > 1)
                        ret = -EINVAL;
                task_unlock(task);
                if (ret)
                        break;
        }
        return ret;
}

static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
        int i;

        mutex_lock(&blkcg_pol_mutex);

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkcg *blkcg;

                if (!pol || !pol->cpd_bind_fn)
                        continue;

                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
                        if (blkcg->cpd[pol->plid])
                                pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
        }
        mutex_unlock(&blkcg_pol_mutex);
}

static void blkcg_exit(struct task_struct *tsk)
{
        if (tsk->throttle_queue)
                blk_put_queue(tsk->throttle_queue);
        tsk->throttle_queue = NULL;
}

struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .can_attach = blkcg_can_attach,
        .bind = blkcg_bind,
        .dfl_cftypes = blkcg_files,
        .legacy_cftypes = blkcg_legacy_files,
        .legacy_name = "blkio",
        .exit = blkcg_exit,
#ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
         * together on the default hierarchy so that the owner cgroup can
         * be retrieved from writeback pages.
         */
        .depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
                          const struct blkcg_policy *pol)
{
        struct blkg_policy_data *pd_prealloc = NULL;
        struct blkcg_gq *blkg;
        int ret;

        if (blkcg_policy_enabled(q, pol))
                return 0;

        if (q->mq_ops)
                blk_mq_freeze_queue(q);
        else
                blk_queue_bypass_start(q);
pd_prealloc:
        if (!pd_prealloc) {
                pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
                if (!pd_prealloc) {
                        ret = -ENOMEM;
                        goto out_bypass_end;
                }
        }

        spin_lock_irq(q->queue_lock);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkg_policy_data *pd;

                if (blkg->pd[pol->plid])
                        continue;

                pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
                if (!pd)
                        swap(pd, pd_prealloc);
                if (!pd) {
                        spin_unlock_irq(q->queue_lock);
                        goto pd_prealloc;
                }

                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pd->plid = pol->plid;
                if (pol->pd_init_fn)
                        pol->pd_init_fn(pd);
        }

        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;

        spin_unlock_irq(q->queue_lock);
out_bypass_end:
        if (q->mq_ops)
                blk_mq_unfreeze_queue(q);
        else
                blk_queue_bypass_end(q);
        if (pd_prealloc)
                pol->pd_free_fn(pd_prealloc);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
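/*
 * Example: a policy implementation usually activates itself when it
 * attaches to a queue and deactivates on teardown (sketch only; the
 * "example_*" policy name is hypothetical and error handling is elided):
 *
 *	ret = blkcg_activate_policy(q, &example_blkcg_policy);
 *	if (ret)
 *		return ret;
 *	// ... use per-blkg policy data via blkg_to_pd() ...
 *	// and on exit:
 *	blkcg_deactivate_policy(q, &example_blkcg_policy);
 */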

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
                             const struct blkcg_policy *pol)
{
        struct blkcg_gq *blkg;

        if (!blkcg_policy_enabled(q, pol))
                return;

        if (q->mq_ops)
                blk_mq_freeze_queue(q);
        else
                blk_queue_bypass_start(q);

        spin_lock_irq(q->queue_lock);

        __clear_bit(pol->plid, q->blkcg_pols);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                if (blkg->pd[pol->plid]) {
                        if (pol->pd_offline_fn)
                                pol->pd_offline_fn(blkg->pd[pol->plid]);
                        pol->pd_free_fn(blkg->pd[pol->plid]);
                        blkg->pd[pol->plid] = NULL;
                }
        }

        spin_unlock_irq(q->queue_lock);

        if (q->mq_ops)
                blk_mq_unfreeze_queue(q);
        else
                blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
        struct blkcg *blkcg;
        int i, ret;

        mutex_lock(&blkcg_pol_register_mutex);
        mutex_lock(&blkcg_pol_mutex);

        /* find an empty slot */
        ret = -ENOSPC;
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (!blkcg_policy[i])
                        break;
        if (i >= BLKCG_MAX_POLS) {
                pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
                goto err_unlock;
        }

1566         /* cpd_alloc_fn/cpd_free_fn and pd_alloc_fn/pd_free_fn must come in pairs */
             ret = -EINVAL;
1567         if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
1568                 (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
1569                 goto err_unlock;
1570
1571         /* register @pol */
1572         pol->plid = i;
1573         blkcg_policy[pol->plid] = pol;
1574
1575         /* allocate and install cpd's */
1576         if (pol->cpd_alloc_fn) {
1577                 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1578                         struct blkcg_policy_data *cpd;
1579
1580                         cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1581                         if (!cpd)
1582                                 goto err_free_cpds;
1583
1584                         blkcg->cpd[pol->plid] = cpd;
1585                         cpd->blkcg = blkcg;
1586                         cpd->plid = pol->plid;
1587                         pol->cpd_init_fn(cpd);
1588                 }
1589         }
1590
1591         mutex_unlock(&blkcg_pol_mutex);
1592
1593         /* everything is in place, add intf files for the new policy */
1594         if (pol->dfl_cftypes)
1595                 WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
1596                                                pol->dfl_cftypes));
1597         if (pol->legacy_cftypes)
1598                 WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
1599                                                   pol->legacy_cftypes));
1600         mutex_unlock(&blkcg_pol_register_mutex);
1601         return 0;
1602
1603 err_free_cpds:
1604         if (pol->cpd_free_fn) {
1605                 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1606                         if (blkcg->cpd[pol->plid]) {
1607                                 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1608                                 blkcg->cpd[pol->plid] = NULL;
1609                         }
1610                 }
1611         }
1612         blkcg_policy[pol->plid] = NULL;
1613 err_unlock:
1614         mutex_unlock(&blkcg_pol_mutex);
1615         mutex_unlock(&blkcg_pol_register_mutex);
1616         return ret;
1617 }
1618 EXPORT_SYMBOL_GPL(blkcg_policy_register);
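
     /*
      * Registration sketch (hypothetical names, for illustration only): a
      * policy defines its struct blkcg_policy and registers it at init time:
      *
      *	static struct blkcg_policy example_policy = {
      *		.pd_alloc_fn	= example_pd_alloc,
      *		.pd_free_fn	= example_pd_free,
      *		.pd_init_fn	= example_pd_init,
      *	};
      *
      *	static int __init example_module_init(void)
      *	{
      *		return blkcg_policy_register(&example_policy);
      *	}
      *
      * Note that the alloc/free callbacks must come in pairs or the
      * registration is rejected.
      */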
1619
1620 /**
1621  * blkcg_policy_unregister - unregister a blkcg policy
1622  * @pol: blkcg policy to unregister
1623  *
1624  * Undo blkcg_policy_register(@pol).  Might sleep.
1625  */
1626 void blkcg_policy_unregister(struct blkcg_policy *pol)
1627 {
1628         struct blkcg *blkcg;
1629
1630         mutex_lock(&blkcg_pol_register_mutex);
1631
1632         if (WARN_ON(blkcg_policy[pol->plid] != pol))
1633                 goto out_unlock;
1634
1635         /* kill the intf files first */
1636         if (pol->dfl_cftypes)
1637                 cgroup_rm_cftypes(pol->dfl_cftypes);
1638         if (pol->legacy_cftypes)
1639                 cgroup_rm_cftypes(pol->legacy_cftypes);
1640
1641         /* remove cpds and unregister */
1642         mutex_lock(&blkcg_pol_mutex);
1643
1644         if (pol->cpd_free_fn) {
1645                 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1646                         if (blkcg->cpd[pol->plid]) {
1647                                 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1648                                 blkcg->cpd[pol->plid] = NULL;
1649                         }
1650                 }
1651         }
1652         blkcg_policy[pol->plid] = NULL;
1653
1654         mutex_unlock(&blkcg_pol_mutex);
1655 out_unlock:
1656         mutex_unlock(&blkcg_pol_register_mutex);
1657 }
1658 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
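
     /*
      * A module registering a policy as sketched above would pair the call
      * with blkcg_policy_unregister(&example_policy) in its exit handler.
      */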
1659
1660 /*
1661  * Scale the accumulated delay based on how long it has been since we last
1662  * updated it.  This is called when adding delay, in case it has been a while
1663  * since delay was last added, and when checking whether a task needs to be
1664  * delayed, so that delay which has already elapsed is accounted for.
1665  */
1666 static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
1667 {
1668         u64 old = atomic64_read(&blkg->delay_start);
1669
1670         /*
1671          * We only want to scale down once per second.  The idea here is that
1672          * we want to delay tasks for min(delay_nsec, NSEC_PER_SEC) within a
1673          * given time window.  Only recent delay should be charged, in 1
1674          * second windows, since that is the maximum a task can be throttled
1675          * for.  We save the portion of the current window still to be charged
1676          * to the blkg from this point onward in blkg->last_delay, and
1677          * blkg->last_use keeps track of the use_delay counter.  If use_delay
1678          * has dropped, the blkg is being unthrottled and we are ok with
1679          * whatever is happening now, so we can take away more of the
1680          * accumulated delay as we have already throttled enough that
1681          * everybody is happy with their IO latencies.
1682          */
1683         if (time_before64(old + NSEC_PER_SEC, now) &&
1684             atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
1685                 u64 cur = atomic64_read(&blkg->delay_nsec);
1686                 u64 sub = min_t(u64, blkg->last_delay, now - old);
1687                 int cur_use = atomic_read(&blkg->use_delay);
1688
1689                 /*
1690                  * We've been unthrottled, subtract a larger chunk of our
1691                  * accumulated delay.
1692                  */
1693                 if (cur_use < blkg->last_use)
1694                         sub = max_t(u64, sub, blkg->last_delay >> 1);
1695
1696                 /*
1697                  * This shouldn't happen, but handle it anyway.  Our delay_nsec
1698                  * should only ever be growing except here where we subtract out
1699                  * min(last_delay, 1 second), but lord knows bugs happen and I'd
1700                  * rather not end up with negative numbers.
1701                  */
1702                 if (unlikely(cur < sub)) {
1703                         atomic64_set(&blkg->delay_nsec, 0);
1704                         blkg->last_delay = 0;
1705                 } else {
1706                         atomic64_sub(sub, &blkg->delay_nsec);
1707                         blkg->last_delay = cur - sub;
1708                 }
1709                 blkg->last_use = cur_use;
1710         }
1711 }
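
     /*
      * Worked example (illustrative numbers): say 3s have elapsed since
      * delay_start, delay_nsec is 5s and last_delay is 1s.  More than a
      * second has passed, so the winner of the cmpxchg rebases delay_start
      * to now and subtracts sub = min(last_delay, elapsed) = 1s, leaving
      * delay_nsec at 4s and last_delay = 4s.  Had use_delay dropped since
      * the last scaling (cur_use < last_use), sub would instead be at least
      * last_delay / 2, shedding accumulated delay faster while the blkg is
      * being unthrottled.
      */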
1712
1713 /*
1714  * This is called when we want to walk up the hierarchy, check whether we need
1715  * to throttle, and then throttle if there is accumulated delay.  It should
1716  * only be called upon return to user space, so that we are not holding a lock
1717  * that could cause a priority inversion.
1718  */
1719 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
1720 {
1721         u64 now = ktime_to_ns(ktime_get());
1722         u64 exp;
1723         u64 delay_nsec = 0;
1724         int tok;
1725
1726         while (blkg->parent) {
1727                 if (atomic_read(&blkg->use_delay)) {
1728                         blkcg_scale_delay(blkg, now);
1729                         delay_nsec = max_t(u64, delay_nsec,
1730                                            atomic64_read(&blkg->delay_nsec));
1731                 }
1732                 blkg = blkg->parent;
1733         }
1734
1735         if (!delay_nsec)
1736                 return;
1737
1738         /*
1739          * Let's not sleep for all eternity if we've amassed a huge delay.
1740          * Swapping or metadata IO can accumulate tens of seconds worth of
1741          * delay, and we want userspace to be able to do _something_, so cap
1742          * the delay at 0.25s.  If there are tens of seconds worth of delay
1743          * then the task will be delayed for 0.25s on every syscall.
1744          */
1745         delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
1746
1747         /*
1748          * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
1749          * that hasn't landed upstream yet.  Once that stuff is in place we need
1750          * to do a psi_memstall_enter/leave if memdelay is set.
1751          */
1752
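             /*
              * Sleep to the absolute deadline in TASK_KILLABLE so that a
              * fatal signal can cut the throttle short;
              * io_schedule_prepare()/io_schedule_finish() bracket the sleep
              * so it is accounted as IO wait.
              */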
1753         exp = ktime_add_ns(now, delay_nsec);
1754         tok = io_schedule_prepare();
1755         do {
1756                 __set_current_state(TASK_KILLABLE);
1757                 if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
1758                         break;
1759         } while (!fatal_signal_pending(current));
1760         io_schedule_finish(tok);
1761 }
1762
1763 /**
1764  * blkcg_maybe_throttle_current - throttle the current task if it has been marked
1765  *
1766  * This is only called if we've been marked with set_notify_resume().  We can
1767  * be marked with set_notify_resume() for reasons other than blkcg throttling,
1768  * so if current->throttle_queue is not set this does nothing.  This should
1769  * only ever be called by the resume code; it's not meant to be called
1770  * willy-nilly, as it does the actual work of throttling the task if it has
1771  * been set up for throttling.
1772  */
1773 void blkcg_maybe_throttle_current(void)
1774 {
1775         struct request_queue *q = current->throttle_queue;
1776         struct cgroup_subsys_state *css;
1777         struct blkcg *blkcg;
1778         struct blkcg_gq *blkg;
1779         bool use_memdelay = current->use_memdelay;
1780
1781         if (!q)
1782                 return;
1783
1784         current->throttle_queue = NULL;
1785         current->use_memdelay = false;
1786
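             /*
              * Resolve the blkcg to check: a kthread may have an explicit
              * blkcg association set up via kthread_blkcg(); otherwise fall
              * back to the io cgroup of the current task.
              */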
1787         rcu_read_lock();
1788         css = kthread_blkcg();
1789         if (css)
1790                 blkcg = css_to_blkcg(css);
1791         else
1792                 blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
1793
1794         if (!blkcg)
1795                 goto out;
1796         blkg = blkg_lookup(blkcg, q);
1797         if (!blkg)
1798                 goto out;
1799         if (!blkg_tryget(blkg))
1800                 goto out;
1801         rcu_read_unlock();
1802
1803         blkcg_maybe_throttle_blkg(blkg, use_memdelay);
1804         blkg_put(blkg);
1805         blk_put_queue(q);
1806         return;
1807 out:
1808         rcu_read_unlock();
1809         blk_put_queue(q);
1810 }
1811 EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
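
     /*
      * As of this writing the only intended caller is the notify-resume path
      * (tracehook_notify_resume()) on the way back to user space, which is
      * why it is safe for this function to sleep.
      */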
1812
1813 /**
1814  * blkcg_schedule_throttle - this task needs to check for throttling
1815  * @q: the request_queue the IO was submitted on
1816  * @use_memdelay: do we charge this to memory delay for PSI
1817  *
1818  * This is called by the IO controller when we know there's delay accumulated
1819  * for the blkg for this task.  We do not pass the blkg because some of the
1820  * places we call this from may not have that information; the swapping code,
1821  * for instance, only has a request_queue at that point.  This sets
1822  * notify_resume for the task so that it checks whether it requires throttling
1823  * before returning to user space.
1824  *
1825  * We will only schedule once per syscall.  You can call this over and over
1826  * again and it will only do the check once upon return to user space, and
1827  * only throttle once.  If the task needs to be throttled again it'll need to
1828  * be re-set the next time we see the task.
1829  */
1830 void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
1831 {
1832         if (unlikely(current->flags & PF_KTHREAD))
1833                 return;
1834
1835         if (!blk_get_queue(q))
1836                 return;
1837
1838         if (current->throttle_queue)
1839                 blk_put_queue(current->throttle_queue);
1840         current->throttle_queue = q;
1841         if (use_memdelay)
1842                 current->use_memdelay = use_memdelay;
1843         set_notify_resume(current);
1844 }
1845 EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
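
     /*
      * Sketch of the expected calling pattern from an IO controller
      * (hypothetical trigger, for illustration): on deciding a blkg needs
      * more throttling, a controller can charge delay and then arm the
      * notify-resume check for the current task:
      *
      *	blkcg_add_delay(blkg, ktime_to_ns(ktime_get()), extra_delay_ns);
      *	blkcg_schedule_throttle(blkg->q, false);
      *
      * The offending task is then delayed on its next return to user space.
      */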
1846
1847 /**
1848  * blkcg_add_delay - add delay to this blkg
      * @blkg: blkg of interest
1849  * @now: the current time in nanoseconds
1850  * @delta: how many nanoseconds of delay to add
1851  *
1852  * Charge @delta to the blkg's current delay accumulation.  This is used to
1853  * throttle tasks if an IO controller thinks we need more throttling.
1854  */
1855 void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
1856 {
1857         blkcg_scale_delay(blkg, now);
1858         atomic64_add(delta, &blkg->delay_nsec);
1859 }
1860 EXPORT_SYMBOL_GPL(blkcg_add_delay);
1861
1862 module_param(blkcg_debug_stats, bool, 0644);
1863 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");