/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include "blk.h"

#define MAX_KEY_LEN 100

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);           /* protected by blkcg_pol_mutex */

static bool blkcg_debug_stats = false;

static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkcg_policy *pol)
{
        return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
        int i;

        if (!blkg)
                return;

        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (blkg->pd[i])
                        blkcg_policy[i]->pd_free_fn(blkg->pd[i]);

        if (blkg->blkcg != &blkcg_root)
                blk_exit_rl(blkg->q, &blkg->rl);

        blkg_rwstat_exit(&blkg->stat_ios);
        blkg_rwstat_exit(&blkg->stat_bytes);
        kfree(blkg);
}

static void __blkg_release(struct rcu_head *rcu)
{
        struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

        percpu_ref_exit(&blkg->refcnt);

        /* release the blkcg and parent blkg refs this blkg has been holding */
        css_put(&blkg->blkcg->css);
        if (blkg->parent)
                blkg_put(blkg->parent);

        wb_congested_put(blkg->wb_congested);

        blkg_free(blkg);
}

/*
 * A group is RCU protected, but holding the RCU read lock alone does not
 * mean one can access all the fields of blkg and assume they are valid.
 * For example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under RCU allows access only to values local
 * to the group, like group stats and group rate limits.
 */
static void blkg_release(struct percpu_ref *ref)
{
        struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

        call_rcu(&blkg->rcu_head, __blkg_release);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
                                   gfp_t gfp_mask)
{
        struct blkcg_gq *blkg;
        int i;

        /* alloc and init base part */
        blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
        if (!blkg)
                return NULL;

        if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
            blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
                goto err_free;

        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;

        /* root blkg uses @q->root_rl, init rl only for !root blkgs */
        if (blkcg != &blkcg_root) {
                if (blk_init_rl(&blkg->rl, q, gfp_mask))
                        goto err_free;
                blkg->rl.blkg = blkg;
        }

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkg_policy_data *pd;

                if (!blkcg_policy_enabled(q, pol))
                        continue;

                /* alloc per-policy data and attach it to blkg */
                pd = pol->pd_alloc_fn(gfp_mask, q->node);
                if (!pd)
                        goto err_free;

                blkg->pd[i] = pd;
                pd->blkg = blkg;
                pd->plid = i;
        }

        return blkg;

err_free:
        blkg_free(blkg);
        return NULL;
}

struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
                                      struct request_queue *q, bool update_hint)
{
        struct blkcg_gq *blkg;

        /*
         * Hint didn't match.  Look up from the radix tree.  Note that the
         * hint can only be updated under queue_lock as otherwise @blkg
         * could have already been removed from blkg_tree.  The caller is
         * responsible for grabbing queue_lock if @update_hint.
         */
        blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
        if (blkg && blkg->q == q) {
                if (update_hint) {
                        lockdep_assert_held(q->queue_lock);
                        rcu_assign_pointer(blkcg->blkg_hint, blkg);
                }
                return blkg;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
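/*
 * Usage sketch (illustrative only): IO-path callers normally go through
 * the blkg_lookup() fast path in blk-cgroup.h, which consults
 * blkcg->blkg_hint first and falls back to this slowpath on a miss:
 *
 *	rcu_read_lock();
 *	blkg = blkg_lookup(blkcg, q);
 *	if (blkg)
 *		; // access only group-local data, e.g. stats and limits
 *	rcu_read_unlock();
 *
 * Passing @update_hint == true is only valid with @q->queue_lock held,
 * as asserted above.
 */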

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                                    struct request_queue *q,
                                    struct blkcg_gq *new_blkg)
{
        struct blkcg_gq *blkg;
        struct bdi_writeback_congested *wb_congested;
        int i, ret;

        WARN_ON_ONCE(!rcu_read_lock_held());
        lockdep_assert_held(q->queue_lock);

        /* blkg holds a reference to blkcg */
        if (!css_tryget_online(&blkcg->css)) {
                ret = -ENODEV;
                goto err_free_blkg;
        }

        wb_congested = wb_congested_get_create(q->backing_dev_info,
                                               blkcg->css.id,
                                               GFP_NOWAIT | __GFP_NOWARN);
        if (!wb_congested) {
                ret = -ENOMEM;
                goto err_put_css;
        }

        /* allocate */
        if (!new_blkg) {
                new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto err_put_congested;
                }
        }
        blkg = new_blkg;
        blkg->wb_congested = wb_congested;

        /* link parent */
        if (blkcg_parent(blkcg)) {
                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
                if (WARN_ON_ONCE(!blkg->parent)) {
                        ret = -ENODEV;
                        goto err_put_congested;
                }
                blkg_get(blkg->parent);
        }

        ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
                              GFP_NOWAIT | __GFP_NOWARN);
        if (ret)
                goto err_cancel_ref;

        /* invoke per-policy init */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkg->pd[i] && pol->pd_init_fn)
                        pol->pd_init_fn(blkg->pd[i]);
        }

        /* insert */
        spin_lock(&blkcg->lock);
        ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
        if (likely(!ret)) {
                hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
                list_add(&blkg->q_node, &q->blkg_list);

                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];

                        if (blkg->pd[i] && pol->pd_online_fn)
                                pol->pd_online_fn(blkg->pd[i]);
                }
        }
        blkg->online = true;
        spin_unlock(&blkcg->lock);

        if (!ret)
                return blkg;

        /* @blkg failed to be fully initialized, use the usual release path */
        blkg_put(blkg);
        return ERR_PTR(ret);

err_cancel_ref:
        percpu_ref_exit(&blkg->refcnt);
err_put_congested:
        wb_congested_put(wb_congested);
err_put_css:
        css_put(&blkcg->css);
err_free_blkg:
        blkg_free(new_blkg);
        return ERR_PTR(ret);
}

/**
 * __blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkg's have access to the parent blkg.  This function
 * should be called under RCU read lock and @q->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create fails as it walks
 * down from root.
 */
struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
                                      struct request_queue *q)
{
        struct blkcg_gq *blkg;

        WARN_ON_ONCE(!rcu_read_lock_held());
        lockdep_assert_held(q->queue_lock);

        /*
         * This could be the first entry point of blkcg implementation and
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
                return q->root_blkg;

        blkg = __blkg_lookup(blkcg, q, true);
        if (blkg)
                return blkg;

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.  Returns the closest
         * blkg to the intended blkg should blkg_create() fail.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent = blkcg_parent(blkcg);
                struct blkcg_gq *ret_blkg = q->root_blkg;

                while (parent) {
                        blkg = __blkg_lookup(parent, q, false);
                        if (blkg) {
                                /* remember closest blkg */
                                ret_blkg = blkg;
                                break;
                        }
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                blkg = blkg_create(pos, q, NULL);
                if (IS_ERR(blkg))
                        return ret_blkg;
                if (pos == blkcg)
                        return blkg;
        }
}

/**
 * blkg_lookup_create - find or create a blkg
 * @blkcg: target block cgroup
 * @q: target request_queue
 *
 * This looks up or creates the blkg representing the unique pair
 * of the blkcg and the request_queue.
 */
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                                    struct request_queue *q)
{
        struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
        unsigned long flags;

        if (unlikely(!blkg)) {
                spin_lock_irqsave(q->queue_lock, flags);

                blkg = __blkg_lookup_create(blkcg, q);

                spin_unlock_irqrestore(q->queue_lock, flags);
        }

        return blkg;
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
        struct blkcg *blkcg = blkg->blkcg;
        struct blkcg_gq *parent = blkg->parent;
        int i;

        lockdep_assert_held(blkg->q->queue_lock);
        lockdep_assert_held(&blkcg->lock);

        /* Something is wrong if we are trying to remove the same group twice */
        WARN_ON_ONCE(list_empty(&blkg->q_node));
        WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkg->pd[i] && pol->pd_offline_fn)
                        pol->pd_offline_fn(blkg->pd[i]);
        }

        if (parent) {
                blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
                blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
        }

        blkg->online = false;

        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
        list_del_init(&blkg->q_node);
        hlist_del_init_rcu(&blkg->blkcg_node);

        /*
         * Both setting lookup hint to and clearing it from @blkg are done
         * under queue_lock.  If it's not pointing to @blkg now, it never
         * will.  Hint assignment itself can race safely.
         */
        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
        percpu_ref_kill(&blkg->refcnt);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
        struct blkcg_gq *blkg, *n;

        lockdep_assert_held(q->queue_lock);

        list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;

                spin_lock(&blkcg->lock);
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);
        }

        q->root_blkg = NULL;
        q->root_rl.blkg = NULL;
}

/*
 * The iterator "next" function used by blk_queue_for_each_rl().  It's a
 * bit tricky because the root blkg uses @q->root_rl instead of its own rl.
 */
struct request_list *__blk_queue_next_rl(struct request_list *rl,
                                         struct request_queue *q)
{
        struct list_head *ent;
        struct blkcg_gq *blkg;

        /*
         * Determine the current blkg list_head.  The first entry is
         * root_rl which is off @q->blkg_list and mapped to the head.
         */
        if (rl == &q->root_rl) {
                ent = &q->blkg_list;
                /* There are no more block groups, hence no request lists */
                if (list_empty(ent))
                        return NULL;
        } else {
                blkg = container_of(rl, struct blkcg_gq, rl);
                ent = &blkg->q_node;
        }

        /* walk to the next list_head, skip root blkcg */
        ent = ent->next;
        if (ent == &q->root_blkg->q_node)
                ent = ent->next;
        if (ent == &q->blkg_list)
                return NULL;

        blkg = container_of(ent, struct blkcg_gq, q_node);
        return &blkg->rl;
}

static int blkcg_reset_stats(struct cgroup_subsys_state *css,
                             struct cftype *cftype, u64 val)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg_gq *blkg;
        int i;

        mutex_lock(&blkcg_pol_mutex);
        spin_lock_irq(&blkcg->lock);

        /*
         * Note that stat reset is racy - it doesn't synchronize against
         * stat updates.  This is a debug feature which shouldn't exist
         * anyway.  If you get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
                blkg_rwstat_reset(&blkg->stat_bytes);
                blkg_rwstat_reset(&blkg->stat_ios);

                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];

                        if (blkg->pd[i] && pol->pd_reset_stats_fn)
                                pol->pd_reset_stats_fn(blkg->pd[i]);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        mutex_unlock(&blkcg_pol_mutex);
        return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
        /* some drivers (floppy) instantiate a queue w/o disk registered */
        if (blkg->q->backing_dev_info->dev)
                return dev_name(blkg->q->backing_dev_info->dev);
        return NULL;
}
EXPORT_SYMBOL_GPL(blkg_dev_name);

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data and the matching queue lock held.  If @show_total
 * is %true, the sum of the return values from @prfill is printed with
 * "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total)
{
        struct blkcg_gq *blkg;
        u64 total = 0;

        rcu_read_lock();
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                spin_lock_irq(blkg->q->queue_lock);
                if (blkcg_policy_enabled(blkg->q, pol))
                        total += prfill(sf, blkg->pd[pol->plid], data);
                spin_unlock_irq(blkg->q->queue_lock);
        }
        rcu_read_unlock();

        if (show_total)
                seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
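/*
 * Example: a policy typically builds its seq_show handlers on top of
 * blkcg_print_blkgs() plus a prfill callback.  A minimal sketch; the
 * "example_*" policy, struct and field are hypothetical and not defined
 * in this file:
 *
 *	static u64 example_prfill(struct seq_file *sf,
 *				  struct blkg_policy_data *pd, int off)
 *	{
 *		struct example_pd *epd = container_of(pd, struct example_pd, pd);
 *
 *		return __blkg_prfill_u64(sf, pd, epd->some_counter);
 *	}
 *
 *	static int example_seq_show(struct seq_file *sf, void *v)
 *	{
 *		blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
 *				  example_prfill, &example_blkcg_policy, 0, true);
 *		return 0;
 *	}
 *
 * blkg_prfill_stat() and blkg_prfill_rwstat() below are ready-made prfill
 * callbacks for fields embedded in a policy's blkg_policy_data.
 */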

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
        const char *dname = blkg_dev_name(pd->blkg);

        if (!dname)
                return 0;

        seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
        return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                         const struct blkg_rwstat *rwstat)
{
        static const char *rwstr[] = {
                [BLKG_RWSTAT_READ]      = "Read",
                [BLKG_RWSTAT_WRITE]     = "Write",
                [BLKG_RWSTAT_SYNC]      = "Sync",
                [BLKG_RWSTAT_ASYNC]     = "Async",
                [BLKG_RWSTAT_DISCARD]   = "Discard",
        };
        const char *dname = blkg_dev_name(pd->blkg);
        u64 v;
        int i;

        if (!dname)
                return 0;

        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
                           (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));

        v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
                atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
                atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);

/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_stat in @pd
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
{
        return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_rwstat in @pd
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                       int off)
{
        struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);

        return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);

static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
                                    struct blkg_policy_data *pd, int off)
{
        struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);

        return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/**
 * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
 * @sf: seq_file to print to
 * @v: unused
 *
 * To be used as cftype->seq_show to print blkg->stat_bytes.
 * cftype->private must be set to the blkcg_policy.
 */
int blkg_print_stat_bytes(struct seq_file *sf, void *v)
{
        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
                          blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
                          offsetof(struct blkcg_gq, stat_bytes), true);
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);

/**
 * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
 * @sf: seq_file to print to
 * @v: unused
 *
 * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
 * must be set to the blkcg_policy.
 */
int blkg_print_stat_ios(struct seq_file *sf, void *v)
{
        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
                          blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
                          offsetof(struct blkcg_gq, stat_ios), true);
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_ios);

static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
                                              struct blkg_policy_data *pd,
                                              int off)
{
        struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
                                                              NULL, off);
        return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/**
 * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
 * @sf: seq_file to print to
 * @v: unused
 */
int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
{
        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
                          blkg_prfill_rwstat_field_recursive,
                          (void *)seq_cft(sf)->private,
                          offsetof(struct blkcg_gq, stat_bytes), true);
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);

/**
 * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
 * @sf: seq_file to print to
 * @v: unused
 */
int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
{
        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
                          blkg_prfill_rwstat_field_recursive,
                          (void *)seq_cft(sf)->private,
                          offsetof(struct blkcg_gq, stat_ios), true);
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);

/**
 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
 * @blkg: blkg of interest
 * @pol: blkcg_policy which contains the blkg_stat
 * @off: offset to the blkg_stat in blkg_policy_data or @blkg
 *
 * Collect the blkg_stat specified by @blkg, @pol and @off and all its
 * online descendants and their aux counts.  The caller must be holding the
 * queue lock for online tests.
 *
 * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
 * at @off bytes into @blkg's blkg_policy_data of the policy.
 */
u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
                            struct blkcg_policy *pol, int off)
{
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        u64 sum = 0;

        lockdep_assert_held(blkg->q->queue_lock);

        rcu_read_lock();
        blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
                struct blkg_stat *stat;

                if (!pos_blkg->online)
                        continue;

                if (pol)
                        stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
                else
                        stat = (void *)blkg + off;

                sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
        }
        rcu_read_unlock();

        return sum;
}
EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);

/**
 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
 * @blkg: blkg of interest
 * @pol: blkcg_policy which contains the blkg_rwstat
 * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
 *
 * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
 * online descendants and their aux counts.  The caller must be holding the
 * queue lock for online tests.
 *
 * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
 * is at @off bytes into @blkg's blkg_policy_data of the policy.
 */
struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
                                             struct blkcg_policy *pol, int off)
{
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        struct blkg_rwstat sum = { };
        int i;

        lockdep_assert_held(blkg->q->queue_lock);

        rcu_read_lock();
        blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
                struct blkg_rwstat *rwstat;

                if (!pos_blkg->online)
                        continue;

                if (pol)
                        rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
                else
                        rwstat = (void *)pos_blkg + off;

                for (i = 0; i < BLKG_RWSTAT_NR; i++)
                        atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
                                percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
                                &sum.aux_cnt[i]);
        }
        rcu_read_unlock();

        return sum;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
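/*
 * Example: blkcg_print_stat() below passes @pol == NULL to sum the
 * blkg-embedded stat_bytes/stat_ios.  A policy summing a blkg_rwstat
 * embedded in its own pd would instead do something like the following
 * sketch (the "example_*" names are hypothetical; queue_lock held by
 * the caller):
 *
 *	struct blkg_rwstat sum;
 *	u64 reads;
 *
 *	sum = blkg_rwstat_recursive_sum(blkg, &example_blkcg_policy,
 *			offsetof(struct example_pd, serviced));
 *	reads = atomic64_read(&sum.aux_cnt[BLKG_RWSTAT_READ]);
 */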

/* Performs queue bypass and policy enabled checks then looks up blkg. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
                                          const struct blkcg_policy *pol,
                                          struct request_queue *q)
{
        WARN_ON_ONCE(!rcu_read_lock_held());
        lockdep_assert_held(q->queue_lock);

        if (!blkcg_policy_enabled(q, pol))
                return ERR_PTR(-EOPNOTSUPP);

        /*
         * This could be the first entry point of blkcg implementation and
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
                return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);

        return __blkg_lookup(blkcg, q, true /* update_hint */);
}

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
 * part of @input following MAJ:MIN.  This function returns with RCU read
 * lock and queue lock held and must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   char *input, struct blkg_conf_ctx *ctx)
        __acquires(rcu) __acquires(disk->queue->queue_lock)
{
        struct gendisk *disk;
        struct request_queue *q;
        struct blkcg_gq *blkg;
        unsigned int major, minor;
        int key_len, part, ret;
        char *body;

        if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
                return -EINVAL;

        body = input + key_len;
        if (!isspace(*body))
                return -EINVAL;
        body = skip_spaces(body);

        disk = get_gendisk(MKDEV(major, minor), &part);
        if (!disk)
                return -ENODEV;
        if (part) {
                ret = -ENODEV;
                goto fail;
        }

        q = disk->queue;

        rcu_read_lock();
        spin_lock_irq(q->queue_lock);

        blkg = blkg_lookup_check(blkcg, pol, q);
        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                goto fail_unlock;
        }

        if (blkg)
                goto success;

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent;
                struct blkcg_gq *new_blkg;

                parent = blkcg_parent(blkcg);
                while (parent && !__blkg_lookup(parent, q, false)) {
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                /* Drop locks to do new blkg allocation with GFP_KERNEL. */
                spin_unlock_irq(q->queue_lock);
                rcu_read_unlock();

                new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto fail;
                }

                rcu_read_lock();
                spin_lock_irq(q->queue_lock);

                blkg = blkg_lookup_check(pos, pol, q);
                if (IS_ERR(blkg)) {
                        ret = PTR_ERR(blkg);
                        goto fail_unlock;
                }

                if (blkg) {
                        blkg_free(new_blkg);
                } else {
                        blkg = blkg_create(pos, q, new_blkg);
                        if (unlikely(IS_ERR(blkg))) {
                                ret = PTR_ERR(blkg);
                                goto fail_unlock;
                        }
                }

                if (pos == blkcg)
                        goto success;
        }
success:
        ctx->disk = disk;
        ctx->blkg = blkg;
        ctx->body = body;
        return 0;

fail_unlock:
        spin_unlock_irq(q->queue_lock);
        rcu_read_unlock();
fail:
        put_disk_and_module(disk);
        /*
         * If queue was bypassing, we should retry.  Do so after a
         * short msleep().  It isn't strictly necessary but queue
         * can be bypassing for some time and it's always nice to
         * avoid busy looping.
         */
        if (ret == -EBUSY) {
                msleep(10);
                ret = restart_syscall();
        }
        return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
        __releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
        spin_unlock_irq(ctx->disk->queue->queue_lock);
        rcu_read_unlock();
        put_disk_and_module(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
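/*
 * Example: a policy's cftype ->write handler pairs the two helpers and
 * parses "MAJ:MIN <body>" input.  A hedged sketch; the "example_*" policy
 * and how the parsed value is applied are hypothetical:
 *
 *	static ssize_t example_set_limit(struct kernfs_open_file *of,
 *					 char *buf, size_t nbytes, loff_t off)
 *	{
 *		struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *		struct blkg_conf_ctx ctx;
 *		u64 val;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, &example_blkcg_policy, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		ret = -EINVAL;
 *		if (sscanf(ctx.body, "%llu", &val) == 1) {
 *			// apply @val to ctx.blkg's policy data here
 *			ret = 0;
 *		}
 *
 *		blkg_conf_finish(&ctx);	// drops RCU and queue locks
 *		return ret ?: nbytes;
 *	}
 */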

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
        struct blkcg_gq *blkg;

        rcu_read_lock();

        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                const char *dname;
                char *buf;
                struct blkg_rwstat rwstat;
                u64 rbytes, wbytes, rios, wios, dbytes, dios;
                size_t size = seq_get_buf(sf, &buf), off = 0;
                int i;
                bool has_stats = false;

                dname = blkg_dev_name(blkg);
                if (!dname)
                        continue;

                /*
                 * Hooray string manipulation, count is the size written NOT
                 * INCLUDING THE \0, so size is now count+1 less than what we
                 * had before, but we want to start writing the next bit from
                 * the \0 so we only add count to buf.
                 */
                off += scnprintf(buf+off, size-off, "%s ", dname);

                spin_lock_irq(blkg->q->queue_lock);

                rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
                                        offsetof(struct blkcg_gq, stat_bytes));
                rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
                wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
                dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);

                rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
                                        offsetof(struct blkcg_gq, stat_ios));
                rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
                wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
                dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);

                spin_unlock_irq(blkg->q->queue_lock);

                if (rbytes || wbytes || rios || wios) {
                        has_stats = true;
                        off += scnprintf(buf+off, size-off,
                                         "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
                                         rbytes, wbytes, rios, wios,
                                         dbytes, dios);
                }

                if (!blkcg_debug_stats)
                        goto next;

                if (atomic_read(&blkg->use_delay)) {
                        has_stats = true;
                        off += scnprintf(buf+off, size-off,
                                         " use_delay=%d delay_nsec=%llu",
                                         atomic_read(&blkg->use_delay),
                                         (unsigned long long)atomic64_read(&blkg->delay_nsec));
                }

                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];
                        size_t written;

                        if (!blkg->pd[i] || !pol->pd_stat_fn)
                                continue;

                        written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
                        if (written)
                                has_stats = true;
                        off += written;
                }
next:
                if (has_stats) {
                        off += scnprintf(buf+off, size-off, "\n");
                        seq_commit(sf, off);
                }
        }

        rcu_read_unlock();
        return 0;
}

static struct cftype blkcg_files[] = {
        {
                .name = "stat",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = blkcg_print_stat,
        },
        { }     /* terminate */
};

static struct cftype blkcg_legacy_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkcg_reset_stats,
        },
        { }     /* terminate */
};

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback.  Here we tie the next stage of blkg destruction
 *    to the completion of writeback associated with the blkcg.  This lets us
 *    avoid punting potentially large amounts of outstanding writeback to root
 *    while maintaining any ongoing policies.  The next stage is triggered when
 *    the cgwb reference count (blkcg->cgwb_refcnt) goes to zero.
 *
 * 2. When the cgwb refcount goes to zero, blkcg_destroy_blkgs() is called
 *    and handles the destruction of blkgs.  Here the css reference held by
 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
 *    This work may occur in cgwb_release_workfn() on the cgwb_release
 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
 *    punted to the root_blkg.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
 *    This finally frees the blkcg.
 */

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away.  Here the cgwbs are
 * offlined first and only once writeback associated with the blkcg has
 * finished do we start step 2 (see above).
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);

        /* this prevents anyone from attaching or migrating to this blkcg */
        wb_blkcg_offline(blkcg);

        /* put the base cgwb reference allowing step 2 to be triggered */
        blkcg_cgwb_put(blkcg);
}

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
        spin_lock_irq(&blkcg->lock);

        while (!hlist_empty(&blkcg->blkg_list)) {
                struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
                                                struct blkcg_gq, blkcg_node);
                struct request_queue *q = blkg->q;

                if (spin_trylock(q->queue_lock)) {
                        blkg_destroy(blkg);
                        spin_unlock(q->queue_lock);
                } else {
                        spin_unlock_irq(&blkcg->lock);
                        cpu_relax();
                        spin_lock_irq(&blkcg->lock);
                }
        }

        spin_unlock_irq(&blkcg->lock);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        int i;

        mutex_lock(&blkcg_pol_mutex);

        list_del(&blkcg->all_blkcgs_node);

        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (blkcg->cpd[i])
                        blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

        mutex_unlock(&blkcg_pol_mutex);

        kfree(blkcg);
}

static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct blkcg *blkcg;
        struct cgroup_subsys_state *ret;
        int i;

        mutex_lock(&blkcg_pol_mutex);

        if (!parent_css) {
                blkcg = &blkcg_root;
        } else {
                blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
                if (!blkcg) {
                        ret = ERR_PTR(-ENOMEM);
                        goto unlock;
                }
        }

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkcg_policy_data *cpd;

                /*
                 * If the policy hasn't been registered yet, skip it here;
                 * its cpd will be allocated for all existing blkcgs at
                 * registration time. Otherwise, check if the policy
                 * requires any specific per-cgroup data: if it does,
                 * allocate and initialize it.
                 */
                if (!pol || !pol->cpd_alloc_fn)
                        continue;

                cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                if (!cpd) {
                        ret = ERR_PTR(-ENOMEM);
                        goto free_pd_blkcg;
                }
                blkcg->cpd[i] = cpd;
                cpd->blkcg = blkcg;
                cpd->plid = i;
                if (pol->cpd_init_fn)
                        pol->cpd_init_fn(cpd);
        }

        spin_lock_init(&blkcg->lock);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&blkcg->cgwb_list);
        refcount_set(&blkcg->cgwb_refcnt, 1);
#endif
        list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

        mutex_unlock(&blkcg_pol_mutex);
        return &blkcg->css;

free_pd_blkcg:
        for (i--; i >= 0; i--)
                if (blkcg->cpd[i])
                        blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

        if (blkcg != &blkcg_root)
                kfree(blkcg);
unlock:
        mutex_unlock(&blkcg_pol_mutex);
        return ret;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
        struct blkcg_gq *new_blkg, *blkg;
        bool preloaded;
        int ret;

        new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
        if (!new_blkg)
                return -ENOMEM;

        preloaded = !radix_tree_preload(GFP_KERNEL);

        /* Make sure the root blkg exists. */
        rcu_read_lock();
        spin_lock_irq(q->queue_lock);
        blkg = blkg_create(&blkcg_root, q, new_blkg);
        if (IS_ERR(blkg))
                goto err_unlock;
        q->root_blkg = blkg;
        q->root_rl.blkg = blkg;
        spin_unlock_irq(q->queue_lock);
        rcu_read_unlock();

        if (preloaded)
                radix_tree_preload_end();

        ret = blk_iolatency_init(q);
        if (ret) {
                spin_lock_irq(q->queue_lock);
                blkg_destroy_all(q);
                spin_unlock_irq(q->queue_lock);
                return ret;
        }

        ret = blk_throtl_init(q);
        if (ret) {
                spin_lock_irq(q->queue_lock);
                blkg_destroy_all(q);
                spin_unlock_irq(q->queue_lock);
        }
        return ret;

err_unlock:
        spin_unlock_irq(q->queue_lock);
        rcu_read_unlock();
        if (preloaded)
                radix_tree_preload_end();
        return PTR_ERR(blkg);
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
        lockdep_assert_held(q->queue_lock);

        /*
         * @q could be exiting and already have destroyed all blkgs as
         * indicated by NULL root_blkg.  If so, don't confuse policies.
         */
        if (!q->root_blkg)
                return;

        blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
        spin_lock_irq(q->queue_lock);
        blkg_destroy_all(q);
        spin_unlock_irq(q->queue_lock);

        blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *dst_css;
        struct io_context *ioc;
        int ret = 0;

        /* task_lock() is needed to avoid races with exit_io_context() */
        cgroup_taskset_for_each(task, dst_css, tset) {
                task_lock(task);
                ioc = task->io_context;
                if (ioc && atomic_read(&ioc->nr_tasks) > 1)
                        ret = -EINVAL;
                task_unlock(task);
                if (ret)
                        break;
        }
        return ret;
}

static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
        int i;

        mutex_lock(&blkcg_pol_mutex);

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkcg *blkcg;

                if (!pol || !pol->cpd_bind_fn)
                        continue;

                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
                        if (blkcg->cpd[pol->plid])
                                pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
        }
        mutex_unlock(&blkcg_pol_mutex);
}

static void blkcg_exit(struct task_struct *tsk)
{
        if (tsk->throttle_queue)
                blk_put_queue(tsk->throttle_queue);
        tsk->throttle_queue = NULL;
}

struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .can_attach = blkcg_can_attach,
        .bind = blkcg_bind,
        .dfl_cftypes = blkcg_files,
        .legacy_cftypes = blkcg_legacy_files,
        .legacy_name = "blkio",
        .exit = blkcg_exit,
#ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
         * together on the default hierarchy so that the owner cgroup can
         * be retrieved from writeback pages.
         */
        .depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
                          const struct blkcg_policy *pol)
{
        struct blkg_policy_data *pd_prealloc = NULL;
        struct blkcg_gq *blkg;
        int ret;

        if (blkcg_policy_enabled(q, pol))
                return 0;

        if (q->mq_ops)
                blk_mq_freeze_queue(q);
        else
                blk_queue_bypass_start(q);
pd_prealloc:
        if (!pd_prealloc) {
                pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
                if (!pd_prealloc) {
                        ret = -ENOMEM;
                        goto out_bypass_end;
                }
        }

        spin_lock_irq(q->queue_lock);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkg_policy_data *pd;

                if (blkg->pd[pol->plid])
                        continue;

                pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
                if (!pd)
                        swap(pd, pd_prealloc);
                if (!pd) {
                        spin_unlock_irq(q->queue_lock);
                        goto pd_prealloc;
                }

                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pd->plid = pol->plid;
                if (pol->pd_init_fn)
                        pol->pd_init_fn(pd);
        }

        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;

        spin_unlock_irq(q->queue_lock);
out_bypass_end:
        if (q->mq_ops)
                blk_mq_unfreeze_queue(q);
        else
                blk_queue_bypass_end(q);
        if (pd_prealloc)
                pol->pd_free_fn(pd_prealloc);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
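/*
 * Example: a policy implementation usually activates itself when it
 * attaches to a queue and deactivates on teardown (sketch only; the
 * "example_*" policy name is hypothetical and error handling is elided):
 *
 *	ret = blkcg_activate_policy(q, &example_blkcg_policy);
 *	if (ret)
 *		return ret;
 *	// ... use per-blkg policy data via blkg_to_pd() ...
 *	// and on exit:
 *	blkcg_deactivate_policy(q, &example_blkcg_policy);
 */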

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
                             const struct blkcg_policy *pol)
{
        struct blkcg_gq *blkg;

        if (!blkcg_policy_enabled(q, pol))
                return;

        if (q->mq_ops)
                blk_mq_freeze_queue(q);
        else
                blk_queue_bypass_start(q);

        spin_lock_irq(q->queue_lock);

        __clear_bit(pol->plid, q->blkcg_pols);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                if (blkg->pd[pol->plid]) {
                        if (pol->pd_offline_fn)
                                pol->pd_offline_fn(blkg->pd[pol->plid]);
                        pol->pd_free_fn(blkg->pd[pol->plid]);
                        blkg->pd[pol->plid] = NULL;
                }
        }

        spin_unlock_irq(q->queue_lock);

        if (q->mq_ops)
                blk_mq_unfreeze_queue(q);
        else
                blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
        struct blkcg *blkcg;
        int i, ret;

        mutex_lock(&blkcg_pol_register_mutex);
        mutex_lock(&blkcg_pol_mutex);

        /* find an empty slot */
        ret = -ENOSPC;
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (!blkcg_policy[i])
                        break;
        if (i >= BLKCG_MAX_POLS) {
                pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
                goto err_unlock;
        }

1566         /* cpd_alloc_fn/cpd_free_fn and pd_alloc_fn/pd_free_fn must come in pairs */
             ret = -EINVAL;
1567         if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
1568                 (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
1569                 goto err_unlock;
1570
1571         /* register @pol */
1572         pol->plid = i;
1573         blkcg_policy[pol->plid] = pol;
1574
1575         /* allocate and install cpd's */
1576         if (pol->cpd_alloc_fn) {
1577                 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1578                         struct blkcg_policy_data *cpd;
1579
1580                         cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1581                         if (!cpd)
1582                                 goto err_free_cpds;
1583
1584                         blkcg->cpd[pol->plid] = cpd;
1585                         cpd->blkcg = blkcg;
1586                         cpd->plid = pol->plid;
1587                         pol->cpd_init_fn(cpd);
1588                 }
1589         }
1590
1591         mutex_unlock(&blkcg_pol_mutex);
1592
1593         /* everything is in place, add intf files for the new policy */
1594         if (pol->dfl_cftypes)
1595                 WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
1596                                                pol->dfl_cftypes));
1597         if (pol->legacy_cftypes)
1598                 WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
1599                                                   pol->legacy_cftypes));
1600         mutex_unlock(&blkcg_pol_register_mutex);
1601         return 0;
1602
1603 err_free_cpds:
1604         if (pol->cpd_free_fn) {
1605                 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1606                         if (blkcg->cpd[pol->plid]) {
1607                                 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1608                                 blkcg->cpd[pol->plid] = NULL;
1609                         }
1610                 }
1611         }
1612         blkcg_policy[pol->plid] = NULL;
1613 err_unlock:
1614         mutex_unlock(&blkcg_pol_mutex);
1615         mutex_unlock(&blkcg_pol_register_mutex);
1616         return ret;
1617 }
1618 EXPORT_SYMBOL_GPL(blkcg_policy_register);
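
     /*
      * Registration sketch (hypothetical names, for illustration only): a
      * policy defines its struct blkcg_policy and registers it at init time:
      *
      *	static struct blkcg_policy example_policy = {
      *		.pd_alloc_fn	= example_pd_alloc,
      *		.pd_free_fn	= example_pd_free,
      *		.pd_init_fn	= example_pd_init,
      *	};
      *
      *	static int __init example_module_init(void)
      *	{
      *		return blkcg_policy_register(&example_policy);
      *	}
      *
      * Note that the alloc/free callbacks must come in pairs or the
      * registration is rejected.
      */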
1619
1620 /**
1621  * blkcg_policy_unregister - unregister a blkcg policy
1622  * @pol: blkcg policy to unregister
1623  *
1624  * Undo blkcg_policy_register(@pol).  Might sleep.
1625  */
1626 void blkcg_policy_unregister(struct blkcg_policy *pol)
1627 {
1628         struct blkcg *blkcg;
1629
1630         mutex_lock(&blkcg_pol_register_mutex);
1631
1632         if (WARN_ON(blkcg_policy[pol->plid] != pol))
1633                 goto out_unlock;
1634
1635         /* kill the intf files first */
1636         if (pol->dfl_cftypes)
1637                 cgroup_rm_cftypes(pol->dfl_cftypes);
1638         if (pol->legacy_cftypes)
1639                 cgroup_rm_cftypes(pol->legacy_cftypes);
1640
1641         /* remove cpds and unregister */
1642         mutex_lock(&blkcg_pol_mutex);
1643
1644         if (pol->cpd_free_fn) {
1645                 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1646                         if (blkcg->cpd[pol->plid]) {
1647                                 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1648                                 blkcg->cpd[pol->plid] = NULL;
1649                         }
1650                 }
1651         }
1652         blkcg_policy[pol->plid] = NULL;
1653
1654         mutex_unlock(&blkcg_pol_mutex);
1655 out_unlock:
1656         mutex_unlock(&blkcg_pol_register_mutex);
1657 }
1658 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
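
     /*
      * A module registering a policy as sketched above would pair the call
      * with blkcg_policy_unregister(&example_policy) in its exit handler.
      */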
1659
1660 /*
1661  * Scale the accumulated delay based on how long it has been since we last
1662  * updated it.  This is called when adding delay, in case it has been a while
1663  * since delay was last added, and when checking whether a task needs to be
1664  * delayed, so that delay which has already elapsed is accounted for.
1665  */
1666 static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
1667 {
1668         u64 old = atomic64_read(&blkg->delay_start);
1669
1670         /*
1671          * We only want to scale down once per second.  The idea here is that
1672          * we want to delay tasks for min(delay_nsec, NSEC_PER_SEC) within a
1673          * given time window.  Only recent delay should be charged, in 1
1674          * second windows, since that is the maximum a task can be throttled
1675          * for.  We save the portion of the current window still to be charged
1676          * to the blkg from this point onward in blkg->last_delay, and
1677          * blkg->last_use keeps track of the use_delay counter.  If use_delay
1678          * has dropped, the blkg is being unthrottled and we are ok with
1679          * whatever is happening now, so we can take away more of the
1680          * accumulated delay as we have already throttled enough that
1681          * everybody is happy with their IO latencies.
1682          */
1683         if (time_before64(old + NSEC_PER_SEC, now) &&
1684             atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
1685                 u64 cur = atomic64_read(&blkg->delay_nsec);
1686                 u64 sub = min_t(u64, blkg->last_delay, now - old);
1687                 int cur_use = atomic_read(&blkg->use_delay);
1688
1689                 /*
1690                  * We've been unthrottled, subtract a larger chunk of our
1691                  * accumulated delay.
1692                  */
1693                 if (cur_use < blkg->last_use)
1694                         sub = max_t(u64, sub, blkg->last_delay >> 1);
1695
1696                 /*
1697                  * This shouldn't happen, but handle it anyway.  Our delay_nsec
1698                  * should only ever be growing except here where we subtract out
1699                  * min(last_delay, 1 second), but lord knows bugs happen and I'd
1700                  * rather not end up with negative numbers.
1701                  */
1702                 if (unlikely(cur < sub)) {
1703                         atomic64_set(&blkg->delay_nsec, 0);
1704                         blkg->last_delay = 0;
1705                 } else {
1706                         atomic64_sub(sub, &blkg->delay_nsec);
1707                         blkg->last_delay = cur - sub;
1708                 }
1709                 blkg->last_use = cur_use;
1710         }
1711 }
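
     /*
      * Worked example (illustrative numbers): say 3s have elapsed since
      * delay_start, delay_nsec is 5s and last_delay is 1s.  More than a
      * second has passed, so the winner of the cmpxchg rebases delay_start
      * to now and subtracts sub = min(last_delay, elapsed) = 1s, leaving
      * delay_nsec at 4s and last_delay = 4s.  Had use_delay dropped since
      * the last scaling (cur_use < last_use), sub would instead be at least
      * last_delay / 2, shedding accumulated delay faster while the blkg is
      * being unthrottled.
      */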
1712
1713 /*
1714  * This is called when we want to walk up the hierarchy, check whether we need
1715  * to throttle, and then throttle if there is accumulated delay.  It should
1716  * only be called upon return to user space, so that we are not holding a lock
1717  * that could cause a priority inversion.
1718  */
1719 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
1720 {
1721         u64 now = ktime_to_ns(ktime_get());
1722         u64 exp;
1723         u64 delay_nsec = 0;
1724         int tok;
1725
1726         while (blkg->parent) {
1727                 if (atomic_read(&blkg->use_delay)) {
1728                         blkcg_scale_delay(blkg, now);
1729                         delay_nsec = max_t(u64, delay_nsec,
1730                                            atomic64_read(&blkg->delay_nsec));
1731                 }
1732                 blkg = blkg->parent;
1733         }
1734
1735         if (!delay_nsec)
1736                 return;
1737
1738         /*
1739          * Let's not sleep for all eternity if we've amassed a huge delay.
1740          * Swapping or metadata IO can accumulate tens of seconds worth of
1741          * delay, and we want userspace to be able to do _something_, so cap
1742          * the delay at 0.25s.  If there are tens of seconds worth of delay
1743          * then the task will be delayed for 0.25s on every syscall.
1744          */
1745         delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
1746
1747         /*
1748          * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
1749          * that hasn't landed upstream yet.  Once that stuff is in place we need
1750          * to do a psi_memstall_enter/leave if memdelay is set.
1751          */
1752
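             /*
              * Sleep to the absolute deadline in TASK_KILLABLE so that a
              * fatal signal can cut the throttle short;
              * io_schedule_prepare()/io_schedule_finish() bracket the sleep
              * so it is accounted as IO wait.
              */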
1753         exp = ktime_add_ns(now, delay_nsec);
1754         tok = io_schedule_prepare();
1755         do {
1756                 __set_current_state(TASK_KILLABLE);
1757                 if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
1758                         break;
1759         } while (!fatal_signal_pending(current));
1760         io_schedule_finish(tok);
1761 }
1762
1763 /**
1764  * blkcg_maybe_throttle_current - throttle the current task if it has been marked
1765  *
1766  * This is only called if we've been marked with set_notify_resume().  We can
1767  * be marked with set_notify_resume() for reasons other than blkcg throttling,
1768  * so if current->throttle_queue is not set this does nothing.  This should
1769  * only ever be called by the resume code; it's not meant to be called
1770  * willy-nilly, as it does the actual work of throttling the task if it has
1771  * been set up for throttling.
1772  */
1773 void blkcg_maybe_throttle_current(void)
1774 {
1775         struct request_queue *q = current->throttle_queue;
1776         struct cgroup_subsys_state *css;
1777         struct blkcg *blkcg;
1778         struct blkcg_gq *blkg;
1779         bool use_memdelay = current->use_memdelay;
1780
1781         if (!q)
1782                 return;
1783
1784         current->throttle_queue = NULL;
1785         current->use_memdelay = false;
1786
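             /*
              * Resolve the blkcg to check: a kthread may have an explicit
              * blkcg association set up via kthread_blkcg(); otherwise fall
              * back to the io cgroup of the current task.
              */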
1787         rcu_read_lock();
1788         css = kthread_blkcg();
1789         if (css)
1790                 blkcg = css_to_blkcg(css);
1791         else
1792                 blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
1793
1794         if (!blkcg)
1795                 goto out;
1796         blkg = blkg_lookup(blkcg, q);
1797         if (!blkg)
1798                 goto out;
1799         if (!blkg_tryget(blkg))
1800                 goto out;
1801         rcu_read_unlock();
1802
1803         blkcg_maybe_throttle_blkg(blkg, use_memdelay);
1804         blkg_put(blkg);
1805         blk_put_queue(q);
1806         return;
1807 out:
1808         rcu_read_unlock();
1809         blk_put_queue(q);
1810 }
1811 EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
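
     /*
      * As of this writing the only intended caller is the notify-resume path
      * (tracehook_notify_resume()) on the way back to user space, which is
      * why it is safe for this function to sleep.
      */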
1812
1813 /**
1814  * blkcg_schedule_throttle - this task needs to check for throttling
1815  * @q: the request_queue the IO was submitted on
1816  * @use_memdelay: do we charge this to memory delay for PSI
1817  *
1818  * This is called by the IO controller when we know there's delay accumulated
1819  * for the blkg for this task.  We do not pass the blkg because some of the
1820  * places we call this from may not have that information; the swapping code,
1821  * for instance, only has a request_queue at that point.  This sets
1822  * notify_resume for the task so that it checks whether it requires throttling
1823  * before returning to user space.
1824  *
1825  * We will only schedule once per syscall.  You can call this over and over
1826  * again and it will only do the check once upon return to user space, and
1827  * only throttle once.  If the task needs to be throttled again it'll need to
1828  * be re-set the next time we see the task.
1829  */
1830 void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
1831 {
1832         if (unlikely(current->flags & PF_KTHREAD))
1833                 return;
1834
1835         if (!blk_get_queue(q))
1836                 return;
1837
1838         if (current->throttle_queue)
1839                 blk_put_queue(current->throttle_queue);
1840         current->throttle_queue = q;
1841         if (use_memdelay)
1842                 current->use_memdelay = use_memdelay;
1843         set_notify_resume(current);
1844 }
1845 EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
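
     /*
      * Sketch of the expected calling pattern from an IO controller
      * (hypothetical trigger, for illustration): on deciding a blkg needs
      * more throttling, a controller can charge delay and then arm the
      * notify-resume check for the current task:
      *
      *	blkcg_add_delay(blkg, ktime_to_ns(ktime_get()), extra_delay_ns);
      *	blkcg_schedule_throttle(blkg->q, false);
      *
      * The offending task is then delayed on its next return to user space.
      */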
1846
1847 /**
1848  * blkcg_add_delay - add delay to this blkg
      * @blkg: blkg of interest
1849  * @now: the current time in nanoseconds
1850  * @delta: how many nanoseconds of delay to add
1851  *
1852  * Charge @delta to the blkg's current delay accumulation.  This is used to
1853  * throttle tasks if an IO controller thinks we need more throttling.
1854  */
1855 void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
1856 {
1857         blkcg_scale_delay(blkg, now);
1858         atomic64_add(delta, &blkg->delay_nsec);
1859 }
1860 EXPORT_SYMBOL_GPL(blkcg_add_delay);
1861
1862 module_param(blkcg_debug_stats, bool, 0644);
1863 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");