block/cfq-iosched.c
1 /*
2  *  CFQ, or complete fairness queueing, disk scheduler.
3  *
4  *  Based on ideas from a previously unfinished io
5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6  *
7  *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8  */
9 #include <linux/module.h>
10 #include <linux/slab.h>
11 #include <linux/sched/clock.h>
12 #include <linux/blkdev.h>
13 #include <linux/elevator.h>
14 #include <linux/ktime.h>
15 #include <linux/rbtree.h>
16 #include <linux/ioprio.h>
17 #include <linux/blktrace_api.h>
18 #include <linux/blk-cgroup.h>
19 #include "blk.h"
20 #include "blk-wbt.h"
21
22 /*
23  * tunables
24  */
25 /* max queue in one round of service */
26 static const int cfq_quantum = 8;
27 static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
28 /* maximum backwards seek, in KiB */
29 static const int cfq_back_max = 16 * 1024;
30 /* penalty of a backwards seek */
31 static const int cfq_back_penalty = 2;
32 static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
33 static u64 cfq_slice_async = NSEC_PER_SEC / 25;
34 static const int cfq_slice_async_rq = 2;
35 static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
36 static u64 cfq_group_idle = NSEC_PER_SEC / 125;
37 static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
38 static const int cfq_hist_divisor = 4;
39
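/*
 * For reference, the defaults above work out to roughly: 100ms sync and
 * 40ms async time slices, 8ms queue and group idling, fifo expiry of
 * 250ms and 125ms for the two request classes, a 16MiB maximum backward
 * seek charged at twice the cost of a forward one, and a 300ms target
 * latency per round of service.
 */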
40 /*
41  * offset from end of queue service tree for idle class
42  */
43 #define CFQ_IDLE_DELAY          (NSEC_PER_SEC / 5)
44 /* offset from end of group service tree under time slice mode */
45 #define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
46 /* offset from end of group service under IOPS mode */
47 #define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
48
49 /*
50  * below this threshold, we consider thinktime immediate
51  */
52 #define CFQ_MIN_TT              (2 * NSEC_PER_SEC / HZ)
53
54 #define CFQ_SLICE_SCALE         (5)
55 #define CFQ_HW_QUEUE_MIN        (5)
56 #define CFQ_SERVICE_SHIFT       12
57
58 #define CFQQ_SEEK_THR           (sector_t)(8 * 100)
59 #define CFQQ_CLOSE_THR          (sector_t)(8 * 1024)
60 #define CFQQ_SECT_THR_NONROT    (sector_t)(2 * 32)
61 #define CFQQ_SEEKY(cfqq)        (hweight32(cfqq->seek_history) > 32/8)
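/*
 * seek_history is a 32-bit sliding window of per-request seek samples,
 * each bit recording whether a recent request seeked farther than
 * CFQQ_SEEK_THR (800 sectors, i.e. 400KiB). CFQQ_SEEKY() flags a queue
 * once more than 32/8 = 4 of those bits are set, i.e. more than one
 * eighth of the recent requests were large seeks.
 */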
62
63 #define RQ_CIC(rq)              icq_to_cic((rq)->elv.icq)
64 #define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elv.priv[0])
65 #define RQ_CFQG(rq)             (struct cfq_group *) ((rq)->elv.priv[1])
66
67 static struct kmem_cache *cfq_pool;
68
69 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
70 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
71 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
72
73 #define sample_valid(samples)   ((samples) > 80)
74 #define rb_entry_cfqg(node)     rb_entry((node), struct cfq_group, rb_node)
75
76 /* blkio-related constants */
77 #define CFQ_WEIGHT_LEGACY_MIN   10
78 #define CFQ_WEIGHT_LEGACY_DFL   500
79 #define CFQ_WEIGHT_LEGACY_MAX   1000
80
81 struct cfq_ttime {
82         u64 last_end_request;
83
84         u64 ttime_total;
85         u64 ttime_mean;
86         unsigned long ttime_samples;
87 };
88
89 /*
90  * Most of our rbtree usage is for sorting with min extraction, so
91  * if we cache the leftmost node we don't have to walk down the tree
92  * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
93  * move this into the elevator for the rq sorting as well.
94  */
95 struct cfq_rb_root {
96         struct rb_root_cached rb;
97         unsigned count;
98         u64 min_vdisktime;
99         struct cfq_ttime ttime;
100 };
101 #define CFQ_RB_ROOT     (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
102                         .ttime = {.last_end_request = ktime_get_ns(),},}
103
104 /*
105  * Per process-grouping structure
106  */
107 struct cfq_queue {
108         /* reference count */
109         int ref;
110         /* various state flags, see below */
111         unsigned int flags;
112         /* parent cfq_data */
113         struct cfq_data *cfqd;
114         /* service_tree member */
115         struct rb_node rb_node;
116         /* service_tree key */
117         u64 rb_key;
118         /* prio tree member */
119         struct rb_node p_node;
120         /* prio tree root we belong to, if any */
121         struct rb_root *p_root;
122         /* sorted list of pending requests */
123         struct rb_root sort_list;
124         /* if fifo isn't expired, next request to serve */
125         struct request *next_rq;
126         /* requests queued in sort_list */
127         int queued[2];
128         /* currently allocated requests */
129         int allocated[2];
130         /* fifo list of requests in sort_list */
131         struct list_head fifo;
132
133         /* time when queue got scheduled in to dispatch first request. */
134         u64 dispatch_start;
135         u64 allocated_slice;
136         u64 slice_dispatch;
137         /* time when first request from queue completed and slice started. */
138         u64 slice_start;
139         u64 slice_end;
140         s64 slice_resid;
141
142         /* pending priority requests */
143         int prio_pending;
144         /* number of requests that are on the dispatch list or inside driver */
145         int dispatched;
146
147         /* io prio of this group */
148         unsigned short ioprio, org_ioprio;
149         unsigned short ioprio_class, org_ioprio_class;
150
151         pid_t pid;
152
153         u32 seek_history;
154         sector_t last_request_pos;
155
156         struct cfq_rb_root *service_tree;
157         struct cfq_queue *new_cfqq;
158         struct cfq_group *cfqg;
159         /* Number of sectors dispatched from queue in single dispatch round */
160         unsigned long nr_sectors;
161 };
162
163 /*
164  * First index in the service_trees.
165  * IDLE is handled separately, so it has negative index
166  */
167 enum wl_class_t {
168         BE_WORKLOAD = 0,
169         RT_WORKLOAD = 1,
170         IDLE_WORKLOAD = 2,
171         CFQ_PRIO_NR,
172 };
173
174 /*
175  * Second index in the service_trees.
176  */
177 enum wl_type_t {
178         ASYNC_WORKLOAD = 0,
179         SYNC_NOIDLE_WORKLOAD = 1,
180         SYNC_WORKLOAD = 2
181 };
182
183 struct cfqg_stats {
184 #ifdef CONFIG_CFQ_GROUP_IOSCHED
185         /* number of ios merged */
186         struct blkg_rwstat              merged;
187         /* total time spent on device in ns, may not be accurate w/ queueing */
188         struct blkg_rwstat              service_time;
189         /* total time spent waiting in scheduler queue in ns */
190         struct blkg_rwstat              wait_time;
191         /* number of IOs queued up */
192         struct blkg_rwstat              queued;
193         /* total disk time and nr sectors dispatched by this group */
194         struct blkg_stat                time;
195 #ifdef CONFIG_DEBUG_BLK_CGROUP
196         /* time not charged to this cgroup */
197         struct blkg_stat                unaccounted_time;
198         /* sum of number of ios queued across all samples */
199         struct blkg_stat                avg_queue_size_sum;
200         /* count of samples taken for average */
201         struct blkg_stat                avg_queue_size_samples;
202         /* how many times this group has been removed from service tree */
203         struct blkg_stat                dequeue;
204         /* total time spent waiting for it to be assigned a timeslice. */
205         struct blkg_stat                group_wait_time;
206         /* time spent idling for this blkcg_gq */
207         struct blkg_stat                idle_time;
208         /* total time with empty current active q with other requests queued */
209         struct blkg_stat                empty_time;
210         /* fields after this shouldn't be cleared on stat reset */
211         uint64_t                        start_group_wait_time;
212         uint64_t                        start_idle_time;
213         uint64_t                        start_empty_time;
214         uint16_t                        flags;
215 #endif  /* CONFIG_DEBUG_BLK_CGROUP */
216 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
217 };
218
219 /* Per-cgroup data */
220 struct cfq_group_data {
221         /* must be the first member */
222         struct blkcg_policy_data cpd;
223
224         unsigned int weight;
225         unsigned int leaf_weight;
226 };
227
228 /* This is per cgroup per device grouping structure */
229 struct cfq_group {
230         /* must be the first member */
231         struct blkg_policy_data pd;
232
233         /* group service_tree member */
234         struct rb_node rb_node;
235
236         /* group service_tree key */
237         u64 vdisktime;
238
239         /*
240          * The number of active cfqgs and sum of their weights under this
241          * cfqg.  This covers this cfqg's leaf_weight and all children's
242          * weights, but does not cover weights of further descendants.
243          *
244          * If a cfqg is on the service tree, it's active.  An active cfqg
245          * also activates its parent and contributes to the children_weight
246          * of the parent.
247          */
248         int nr_active;
249         unsigned int children_weight;
250
251         /*
252          * vfraction is the fraction of vdisktime that the tasks in this
253          * cfqg are entitled to.  This is determined by compounding the
254          * ratios walking up from this cfqg to the root.
255          *
256          * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
257          * vfractions on a service tree is approximately 1.  The sum may
258          * deviate a bit due to rounding errors and fluctuations caused by
259          * cfqgs entering and leaving the service tree.
260          */
261         unsigned int vfraction;
262
263         /*
264          * There are two weights - (internal) weight is the weight of this
265          * cfqg against the sibling cfqgs.  leaf_weight is the weight of
266          * this cfqg against the child cfqgs.  For the root cfqg, both
267          * weights are kept in sync for backward compatibility.
268          */
269         unsigned int weight;
270         unsigned int new_weight;
271         unsigned int dev_weight;
272
273         unsigned int leaf_weight;
274         unsigned int new_leaf_weight;
275         unsigned int dev_leaf_weight;
276
277         /* number of cfqq currently on this group */
278         int nr_cfqq;
279
280         /*
281          * Per group busy queues average. Useful for workload slice calc. We
282          * create the array for each prio class but at run time it is used
283          * only for the RT and BE classes and the slot for the IDLE class remains unused.
284          * This is primarily done to avoid confusion and a gcc warning.
285          */
286         unsigned int busy_queues_avg[CFQ_PRIO_NR];
287         /*
288          * rr lists of queues with requests. We maintain service trees for
289          * RT and BE classes. These trees are subdivided in subclasses
290          * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
291          * class there is no subclassification and all the cfq queues go on
292          * a single tree service_tree_idle.
293          * Counts are embedded in the cfq_rb_root
294          */
295         struct cfq_rb_root service_trees[2][3];
296         struct cfq_rb_root service_tree_idle;
297
298         u64 saved_wl_slice;
299         enum wl_type_t saved_wl_type;
300         enum wl_class_t saved_wl_class;
301
302         /* number of requests that are on the dispatch list or inside driver */
303         int dispatched;
304         struct cfq_ttime ttime;
305         struct cfqg_stats stats;        /* stats for this cfqg */
306
307         /* async queue for each priority case */
308         struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
309         struct cfq_queue *async_idle_cfqq;
310
311 };
312
313 struct cfq_io_cq {
314         struct io_cq            icq;            /* must be the first member */
315         struct cfq_queue        *cfqq[2];
316         struct cfq_ttime        ttime;
317         int                     ioprio;         /* the current ioprio */
318 #ifdef CONFIG_CFQ_GROUP_IOSCHED
319         uint64_t                blkcg_serial_nr; /* the current blkcg serial */
320 #endif
321 };
322
323 /*
324  * Per block device queue structure
325  */
326 struct cfq_data {
327         struct request_queue *queue;
328         /* Root service tree for cfq_groups */
329         struct cfq_rb_root grp_service_tree;
330         struct cfq_group *root_group;
331
332         /*
333          * The priority currently being served
334          */
335         enum wl_class_t serving_wl_class;
336         enum wl_type_t serving_wl_type;
337         u64 workload_expires;
338         struct cfq_group *serving_group;
339
340         /*
341          * Each priority tree is sorted by next_request position.  These
342          * trees are used when determining if two or more queues are
343          * interleaving requests (see cfq_close_cooperator).
344          */
345         struct rb_root prio_trees[CFQ_PRIO_LISTS];
346
347         unsigned int busy_queues;
348         unsigned int busy_sync_queues;
349
350         int rq_in_driver;
351         int rq_in_flight[2];
352
353         /*
354          * queue-depth detection
355          */
356         int rq_queued;
357         int hw_tag;
358         /*
359          * hw_tag can be
360          * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
361          *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
362          *  0 => no NCQ
363          */
364         int hw_tag_est_depth;
365         unsigned int hw_tag_samples;
366
367         /*
368          * idle window management
369          */
370         struct hrtimer idle_slice_timer;
371         struct work_struct unplug_work;
372
373         struct cfq_queue *active_queue;
374         struct cfq_io_cq *active_cic;
375
376         sector_t last_position;
377
378         /*
379          * tunables, see top of file
380          */
381         unsigned int cfq_quantum;
382         unsigned int cfq_back_penalty;
383         unsigned int cfq_back_max;
384         unsigned int cfq_slice_async_rq;
385         unsigned int cfq_latency;
386         u64 cfq_fifo_expire[2];
387         u64 cfq_slice[2];
388         u64 cfq_slice_idle;
389         u64 cfq_group_idle;
390         u64 cfq_target_latency;
391
392         /*
393          * Fallback dummy cfqq for extreme OOM conditions
394          */
395         struct cfq_queue oom_cfqq;
396
397         u64 last_delayed_sync;
398 };
399
400 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
401 static void cfq_put_queue(struct cfq_queue *cfqq);
402
403 static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
404                                             enum wl_class_t class,
405                                             enum wl_type_t type)
406 {
407         if (!cfqg)
408                 return NULL;
409
410         if (class == IDLE_WORKLOAD)
411                 return &cfqg->service_tree_idle;
412
413         return &cfqg->service_trees[class][type];
414 }
415
416 enum cfqq_state_flags {
417         CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
418         CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
419         CFQ_CFQQ_FLAG_must_dispatch,    /* must be allowed a dispatch */
420         CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
421         CFQ_CFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
422         CFQ_CFQQ_FLAG_idle_window,      /* slice idling enabled */
423         CFQ_CFQQ_FLAG_prio_changed,     /* task priority has changed */
424         CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
425         CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
426         CFQ_CFQQ_FLAG_coop,             /* cfqq is shared */
427         CFQ_CFQQ_FLAG_split_coop,       /* shared cfqq will be split */
428         CFQ_CFQQ_FLAG_deep,             /* sync cfqq experienced large depth */
429         CFQ_CFQQ_FLAG_wait_busy,        /* Waiting for next request */
430 };
431
432 #define CFQ_CFQQ_FNS(name)                                              \
433 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
434 {                                                                       \
435         (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
436 }                                                                       \
437 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
438 {                                                                       \
439         (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
440 }                                                                       \
441 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
442 {                                                                       \
443         return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
444 }
445
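/*
 * For example, CFQ_CFQQ_FNS(on_rr) below expands to cfq_mark_cfqq_on_rr(),
 * cfq_clear_cfqq_on_rr() and cfq_cfqq_on_rr(), which set, clear and test
 * the CFQ_CFQQ_FLAG_on_rr bit in cfqq->flags.
 */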
446 CFQ_CFQQ_FNS(on_rr);
447 CFQ_CFQQ_FNS(wait_request);
448 CFQ_CFQQ_FNS(must_dispatch);
449 CFQ_CFQQ_FNS(must_alloc_slice);
450 CFQ_CFQQ_FNS(fifo_expire);
451 CFQ_CFQQ_FNS(idle_window);
452 CFQ_CFQQ_FNS(prio_changed);
453 CFQ_CFQQ_FNS(slice_new);
454 CFQ_CFQQ_FNS(sync);
455 CFQ_CFQQ_FNS(coop);
456 CFQ_CFQQ_FNS(split_coop);
457 CFQ_CFQQ_FNS(deep);
458 CFQ_CFQQ_FNS(wait_busy);
459 #undef CFQ_CFQQ_FNS
460
461 #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
462
463 /* cfqg stats flags */
464 enum cfqg_stats_flags {
465         CFQG_stats_waiting = 0,
466         CFQG_stats_idling,
467         CFQG_stats_empty,
468 };
469
470 #define CFQG_FLAG_FNS(name)                                             \
471 static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)     \
472 {                                                                       \
473         stats->flags |= (1 << CFQG_stats_##name);                       \
474 }                                                                       \
475 static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)    \
476 {                                                                       \
477         stats->flags &= ~(1 << CFQG_stats_##name);                      \
478 }                                                                       \
479 static inline int cfqg_stats_##name(struct cfqg_stats *stats)           \
480 {                                                                       \
481         return (stats->flags & (1 << CFQG_stats_##name)) != 0;          \
482 }                                                                       \
483
484 CFQG_FLAG_FNS(waiting)
485 CFQG_FLAG_FNS(idling)
486 CFQG_FLAG_FNS(empty)
487 #undef CFQG_FLAG_FNS
488
489 /* This should be called with the queue_lock held. */
490 static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
491 {
492         unsigned long long now;
493
494         if (!cfqg_stats_waiting(stats))
495                 return;
496
497         now = sched_clock();
498         if (time_after64(now, stats->start_group_wait_time))
499                 blkg_stat_add(&stats->group_wait_time,
500                               now - stats->start_group_wait_time);
501         cfqg_stats_clear_waiting(stats);
502 }
503
504 /* This should be called with the queue_lock held. */
505 static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
506                                                  struct cfq_group *curr_cfqg)
507 {
508         struct cfqg_stats *stats = &cfqg->stats;
509
510         if (cfqg_stats_waiting(stats))
511                 return;
512         if (cfqg == curr_cfqg)
513                 return;
514         stats->start_group_wait_time = sched_clock();
515         cfqg_stats_mark_waiting(stats);
516 }
517
518 /* This should be called with the queue_lock held. */
519 static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
520 {
521         unsigned long long now;
522
523         if (!cfqg_stats_empty(stats))
524                 return;
525
526         now = sched_clock();
527         if (time_after64(now, stats->start_empty_time))
528                 blkg_stat_add(&stats->empty_time,
529                               now - stats->start_empty_time);
530         cfqg_stats_clear_empty(stats);
531 }
532
533 static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
534 {
535         blkg_stat_add(&cfqg->stats.dequeue, 1);
536 }
537
538 static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
539 {
540         struct cfqg_stats *stats = &cfqg->stats;
541
542         if (blkg_rwstat_total(&stats->queued))
543                 return;
544
545         /*
546          * The group is already marked empty. This can happen if a cfqq got a
547          * new request in the parent group and moved to this group while being
548          * added to the service tree. Just ignore the event and move on.
549          */
550         if (cfqg_stats_empty(stats))
551                 return;
552
553         stats->start_empty_time = sched_clock();
554         cfqg_stats_mark_empty(stats);
555 }
556
557 static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
558 {
559         struct cfqg_stats *stats = &cfqg->stats;
560
561         if (cfqg_stats_idling(stats)) {
562                 unsigned long long now = sched_clock();
563
564                 if (time_after64(now, stats->start_idle_time))
565                         blkg_stat_add(&stats->idle_time,
566                                       now - stats->start_idle_time);
567                 cfqg_stats_clear_idling(stats);
568         }
569 }
570
571 static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
572 {
573         struct cfqg_stats *stats = &cfqg->stats;
574
575         BUG_ON(cfqg_stats_idling(stats));
576
577         stats->start_idle_time = sched_clock();
578         cfqg_stats_mark_idling(stats);
579 }
580
581 static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
582 {
583         struct cfqg_stats *stats = &cfqg->stats;
584
585         blkg_stat_add(&stats->avg_queue_size_sum,
586                       blkg_rwstat_total(&stats->queued));
587         blkg_stat_add(&stats->avg_queue_size_samples, 1);
588         cfqg_stats_update_group_wait_time(stats);
589 }
590
591 #else   /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
592
593 static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
594 static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
595 static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
596 static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
597 static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
598 static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
599 static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
600
601 #endif  /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
602
603 #ifdef CONFIG_CFQ_GROUP_IOSCHED
604
605 static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
606 {
607         return pd ? container_of(pd, struct cfq_group, pd) : NULL;
608 }
609
610 static struct cfq_group_data
611 *cpd_to_cfqgd(struct blkcg_policy_data *cpd)
612 {
613         return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
614 }
615
616 static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
617 {
618         return pd_to_blkg(&cfqg->pd);
619 }
620
621 static struct blkcg_policy blkcg_policy_cfq;
622
623 static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
624 {
625         return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
626 }
627
628 static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
629 {
630         return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
631 }
632
633 static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
634 {
635         struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
636
637         return pblkg ? blkg_to_cfqg(pblkg) : NULL;
638 }
639
640 static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
641                                       struct cfq_group *ancestor)
642 {
643         return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
644                                     cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
645 }
646
647 static inline void cfqg_get(struct cfq_group *cfqg)
648 {
649         return blkg_get(cfqg_to_blkg(cfqg));
650 }
651
652 static inline void cfqg_put(struct cfq_group *cfqg)
653 {
654         return blkg_put(cfqg_to_blkg(cfqg));
655 }
656
657 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  do {                    \
658         blk_add_cgroup_trace_msg((cfqd)->queue,                         \
659                         cfqg_to_blkg((cfqq)->cfqg)->blkcg,              \
660                         "cfq%d%c%c " fmt, (cfqq)->pid,                  \
661                         cfq_cfqq_sync((cfqq)) ? 'S' : 'A',              \
662                         cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
663                           ##args);                                      \
664 } while (0)
665
666 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)  do {                    \
667         blk_add_cgroup_trace_msg((cfqd)->queue,                         \
668                         cfqg_to_blkg(cfqg)->blkcg, fmt, ##args);        \
669 } while (0)
670
671 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
672                                             struct cfq_group *curr_cfqg,
673                                             unsigned int op)
674 {
675         blkg_rwstat_add(&cfqg->stats.queued, op, 1);
676         cfqg_stats_end_empty_time(&cfqg->stats);
677         cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
678 }
679
680 static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
681                         uint64_t time, unsigned long unaccounted_time)
682 {
683         blkg_stat_add(&cfqg->stats.time, time);
684 #ifdef CONFIG_DEBUG_BLK_CGROUP
685         blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
686 #endif
687 }
688
689 static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
690                                                unsigned int op)
691 {
692         blkg_rwstat_add(&cfqg->stats.queued, op, -1);
693 }
694
695 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
696                                                unsigned int op)
697 {
698         blkg_rwstat_add(&cfqg->stats.merged, op, 1);
699 }
700
701 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
702                         uint64_t start_time, uint64_t io_start_time,
703                         unsigned int op)
704 {
705         struct cfqg_stats *stats = &cfqg->stats;
706         unsigned long long now = sched_clock();
707
708         if (time_after64(now, io_start_time))
709                 blkg_rwstat_add(&stats->service_time, op, now - io_start_time);
710         if (time_after64(io_start_time, start_time))
711                 blkg_rwstat_add(&stats->wait_time, op,
712                                 io_start_time - start_time);
713 }
714
715 /* @stats = 0 */
716 static void cfqg_stats_reset(struct cfqg_stats *stats)
717 {
718         /* queued stats shouldn't be cleared */
719         blkg_rwstat_reset(&stats->merged);
720         blkg_rwstat_reset(&stats->service_time);
721         blkg_rwstat_reset(&stats->wait_time);
722         blkg_stat_reset(&stats->time);
723 #ifdef CONFIG_DEBUG_BLK_CGROUP
724         blkg_stat_reset(&stats->unaccounted_time);
725         blkg_stat_reset(&stats->avg_queue_size_sum);
726         blkg_stat_reset(&stats->avg_queue_size_samples);
727         blkg_stat_reset(&stats->dequeue);
728         blkg_stat_reset(&stats->group_wait_time);
729         blkg_stat_reset(&stats->idle_time);
730         blkg_stat_reset(&stats->empty_time);
731 #endif
732 }
733
734 /* @to += @from */
735 static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
736 {
737         /* queued stats shouldn't be cleared */
738         blkg_rwstat_add_aux(&to->merged, &from->merged);
739         blkg_rwstat_add_aux(&to->service_time, &from->service_time);
740         blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
741         blkg_stat_add_aux(&to->time, &from->time);
742 #ifdef CONFIG_DEBUG_BLK_CGROUP
743         blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
744         blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
745         blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
746         blkg_stat_add_aux(&to->dequeue, &from->dequeue);
747         blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
748         blkg_stat_add_aux(&to->idle_time, &from->idle_time);
749         blkg_stat_add_aux(&to->empty_time, &from->empty_time);
750 #endif
751 }
752
753 /*
754  * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
755  * recursive stats can still account for the amount used by this cfqg after
756  * it's gone.
757  */
758 static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
759 {
760         struct cfq_group *parent = cfqg_parent(cfqg);
761
762         lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
763
764         if (unlikely(!parent))
765                 return;
766
767         cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
768         cfqg_stats_reset(&cfqg->stats);
769 }
770
771 #else   /* CONFIG_CFQ_GROUP_IOSCHED */
772
773 static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
774 static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
775                                       struct cfq_group *ancestor)
776 {
777         return true;
778 }
779 static inline void cfqg_get(struct cfq_group *cfqg) { }
780 static inline void cfqg_put(struct cfq_group *cfqg) { }
781
782 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
783         blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
784                         cfq_cfqq_sync((cfqq)) ? 'S' : 'A',              \
785                         cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
786                                 ##args)
787 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)          do {} while (0)
788
789 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
790                         struct cfq_group *curr_cfqg, unsigned int op) { }
791 static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
792                         uint64_t time, unsigned long unaccounted_time) { }
793 static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
794                         unsigned int op) { }
795 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
796                         unsigned int op) { }
797 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
798                         uint64_t start_time, uint64_t io_start_time,
799                         unsigned int op) { }
800
801 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
802
803 #define cfq_log(cfqd, fmt, args...)     \
804         blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
805
806 /* Traverses through cfq group service trees */
807 #define for_each_cfqg_st(cfqg, i, j, st) \
808         for (i = 0; i <= IDLE_WORKLOAD; i++) \
809                 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
810                         : &cfqg->service_tree_idle; \
811                         (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
812                         (i == IDLE_WORKLOAD && j == 0); \
813                         j++, st = i < IDLE_WORKLOAD ? \
814                         &cfqg->service_trees[i][j]: NULL) \
815
816 static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
817         struct cfq_ttime *ttime, bool group_idle)
818 {
819         u64 slice;
820         if (!sample_valid(ttime->ttime_samples))
821                 return false;
822         if (group_idle)
823                 slice = cfqd->cfq_group_idle;
824         else
825                 slice = cfqd->cfq_slice_idle;
826         return ttime->ttime_mean > slice;
827 }
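/*
 * In other words, cfq_io_thinktime_big() says a queue/group whose mean
 * think time already exceeds the configured idle window is not worth
 * idling for.
 */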
828
829 static inline bool iops_mode(struct cfq_data *cfqd)
830 {
831         /*
832          * If we are not idling on queues and it is an NCQ drive, requests
833          * execute in parallel and measuring time is not possible in most
834          * cases unless we drive shallower queue depths, which would become a
835          * performance bottleneck. In such cases, switch to providing
836          * fairness in terms of the number of IOs.
837          */
838         if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
839                 return true;
840         else
841                 return false;
842 }
843
844 static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
845 {
846         if (cfq_class_idle(cfqq))
847                 return IDLE_WORKLOAD;
848         if (cfq_class_rt(cfqq))
849                 return RT_WORKLOAD;
850         return BE_WORKLOAD;
851 }
852
853
854 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
855 {
856         if (!cfq_cfqq_sync(cfqq))
857                 return ASYNC_WORKLOAD;
858         if (!cfq_cfqq_idle_window(cfqq))
859                 return SYNC_NOIDLE_WORKLOAD;
860         return SYNC_WORKLOAD;
861 }
862
863 static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
864                                         struct cfq_data *cfqd,
865                                         struct cfq_group *cfqg)
866 {
867         if (wl_class == IDLE_WORKLOAD)
868                 return cfqg->service_tree_idle.count;
869
870         return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
871                 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
872                 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
873 }
874
875 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
876                                         struct cfq_group *cfqg)
877 {
878         return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
879                 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
880 }
881
882 static void cfq_dispatch_insert(struct request_queue *, struct request *);
883 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
884                                        struct cfq_io_cq *cic, struct bio *bio);
885
886 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
887 {
888         /* cic->icq is the first member, %NULL will convert to %NULL */
889         return container_of(icq, struct cfq_io_cq, icq);
890 }
891
892 static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
893                                                struct io_context *ioc)
894 {
895         if (ioc)
896                 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
897         return NULL;
898 }
899
900 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
901 {
902         return cic->cfqq[is_sync];
903 }
904
905 static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
906                                 bool is_sync)
907 {
908         cic->cfqq[is_sync] = cfqq;
909 }
910
911 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
912 {
913         return cic->icq.q->elevator->elevator_data;
914 }
915
916 /*
917  * Schedule a run of the queue if there are requests pending and no one in
918  * the driver will restart queueing.
919  */
920 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
921 {
922         if (cfqd->busy_queues) {
923                 cfq_log(cfqd, "schedule dispatch");
924                 kblockd_schedule_work(&cfqd->unplug_work);
925         }
926 }
927
928 /*
929  * Scale schedule slice based on io priority. Use the sync time slice only
930  * if a queue is marked sync and has sync io queued. A sync queue with async
931  * io only should not get full sync slice length.
932  */
933 static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
934                                  unsigned short prio)
935 {
936         u64 base_slice = cfqd->cfq_slice[sync];
937         u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);
938
939         WARN_ON(prio >= IOPRIO_BE_NR);
940
941         return base_slice + (slice * (4 - prio));
942 }
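/*
 * With the default 100ms sync slice, cfq_prio_slice() above gives
 * base_slice/CFQ_SLICE_SCALE = 20ms per priority level: ioprio 4 (the
 * default) keeps 100ms, ioprio 0 gets 180ms and ioprio 7 gets 40ms.
 */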
943
944 static inline u64
945 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
946 {
947         return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
948 }
949
950 /**
951  * cfqg_scale_charge - scale disk time charge according to cfqg weight
952  * @charge: disk time being charged
953  * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
954  *
955  * Scale @charge according to @vfraction, which is in range (0, 1].  The
956  * scaling is inversely proportional.
957  *
958  * scaled = charge / vfraction
959  *
960  * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
961  */
962 static inline u64 cfqg_scale_charge(u64 charge,
963                                     unsigned int vfraction)
964 {
965         u64 c = charge << CFQ_SERVICE_SHIFT;    /* make it fixed point */
966
967         /* charge / vfraction */
968         c <<= CFQ_SERVICE_SHIFT;
969         return div_u64(c, vfraction);
970 }
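/*
 * For illustration: in cfqg_scale_charge() a group owning the whole
 * service tree (vfraction == 1 << CFQ_SERVICE_SHIFT) is charged
 * charge << CFQ_SERVICE_SHIFT of vdisktime, while a group entitled to
 * half of it accrues twice that for the same amount of service.
 */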
971
972 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
973 {
974         s64 delta = (s64)(vdisktime - min_vdisktime);
975         if (delta > 0)
976                 min_vdisktime = vdisktime;
977
978         return min_vdisktime;
979 }
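/*
 * The signed (s64) delta in max_vdisktime() keeps the comparison correct
 * across u64 wraparound, the same trick CFS uses for vruntime.
 */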
980
981 static void update_min_vdisktime(struct cfq_rb_root *st)
982 {
983         if (!RB_EMPTY_ROOT(&st->rb.rb_root)) {
984                 struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost);
985
986                 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
987                                                   cfqg->vdisktime);
988         }
989 }
990
991 /*
992  * Get the averaged number of busy queues of RT/BE priority. The average is
993  * updated with a formula that gives more weight to higher numbers, so that it
994  * follows sudden increases quickly and decreases slowly.
995  */
996
997 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
998                                         struct cfq_group *cfqg, bool rt)
999 {
1000         unsigned min_q, max_q;
1001         unsigned mult  = cfq_hist_divisor - 1;
1002         unsigned round = cfq_hist_divisor / 2;
1003         unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
1004
1005         min_q = min(cfqg->busy_queues_avg[rt], busy);
1006         max_q = max(cfqg->busy_queues_avg[rt], busy);
1007         cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
1008                 cfq_hist_divisor;
1009         return cfqg->busy_queues_avg[rt];
1010 }
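/*
 * E.g. with cfq_hist_divisor = 4 the update above is (3 * max_q + min_q + 2) / 4:
 * an average of 2 with 6 queues busy becomes 5 (closing 3/4 of the gap on the
 * way up), while an average of 6 with only 2 busy also becomes 5 (closing just
 * 1/4 of the gap on the way down).
 */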
1011
1012 static inline u64
1013 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
1014 {
1015         return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
1016 }
1017
1018 static inline u64
1019 cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1020 {
1021         u64 slice = cfq_prio_to_slice(cfqd, cfqq);
1022         if (cfqd->cfq_latency) {
1023                 /*
1024                  * interested queues (we consider only the ones with the same
1025                  * priority class in the cfq group)
1026                  */
1027                 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
1028                                                 cfq_class_rt(cfqq));
1029                 u64 sync_slice = cfqd->cfq_slice[1];
1030                 u64 expect_latency = sync_slice * iq;
1031                 u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
1032
1033                 if (expect_latency > group_slice) {
1034                         u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
1035                         u64 low_slice;
1036
1037                         /* scale low_slice according to IO priority
1038                          * and sync vs async */
1039                         low_slice = div64_u64(base_low_slice*slice, sync_slice);
1040                         low_slice = min(slice, low_slice);
1041                         /* the adapted slice value is scaled to fit all iqs
1042                          * into the target latency */
1043                         slice = div64_u64(slice*group_slice, expect_latency);
1044                         slice = max(slice, low_slice);
1045                 }
1046         }
1047         return slice;
1048 }
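/*
 * E.g. with the default 300ms target latency and a group entitled to the
 * whole device, five busy sync queues "expect" 5 * 100ms = 500ms per round,
 * so cfq_scaled_cfqq_slice() shrinks each slice by 300/500 (subject to the
 * low_slice floor) to keep one full round within the target latency.
 */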
1049
1050 static inline void
1051 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1052 {
1053         u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1054         u64 now = ktime_get_ns();
1055
1056         cfqq->slice_start = now;
1057         cfqq->slice_end = now + slice;
1058         cfqq->allocated_slice = slice;
1059         cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
1060 }
1061
1062 /*
1063  * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1064  * isn't valid until the first request from the dispatch is activated
1065  * and the slice time set.
1066  */
1067 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
1068 {
1069         if (cfq_cfqq_slice_new(cfqq))
1070                 return false;
1071         if (ktime_get_ns() < cfqq->slice_end)
1072                 return false;
1073
1074         return true;
1075 }
1076
1077 /*
1078  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1079  * We choose the request that is closest to the head right now. Distance
1080  * behind the head is penalized and only allowed to a certain extent.
1081  */
1082 static struct request *
1083 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1084 {
1085         sector_t s1, s2, d1 = 0, d2 = 0;
1086         unsigned long back_max;
1087 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
1088 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
1089         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1090
1091         if (rq1 == NULL || rq1 == rq2)
1092                 return rq2;
1093         if (rq2 == NULL)
1094                 return rq1;
1095
1096         if (rq_is_sync(rq1) != rq_is_sync(rq2))
1097                 return rq_is_sync(rq1) ? rq1 : rq2;
1098
1099         if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1100                 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
1101
1102         s1 = blk_rq_pos(rq1);
1103         s2 = blk_rq_pos(rq2);
1104
1105         /*
1106          * by definition, 1KiB is 2 sectors
1107          */
1108         back_max = cfqd->cfq_back_max * 2;
1109
1110         /*
1111          * Strict one way elevator _except_ in the case where we allow
1112          * short backward seeks which are biased as twice the cost of a
1113          * similar forward seek.
1114          */
1115         if (s1 >= last)
1116                 d1 = s1 - last;
1117         else if (s1 + back_max >= last)
1118                 d1 = (last - s1) * cfqd->cfq_back_penalty;
1119         else
1120                 wrap |= CFQ_RQ1_WRAP;
1121
1122         if (s2 >= last)
1123                 d2 = s2 - last;
1124         else if (s2 + back_max >= last)
1125                 d2 = (last - s2) * cfqd->cfq_back_penalty;
1126         else
1127                 wrap |= CFQ_RQ2_WRAP;
1128
1129         /* Found required data */
1130
1131         /*
1132          * By doing switch() on the bit mask "wrap" we avoid having to
1133          * check two variables for all permutations: --> faster!
1134          */
1135         switch (wrap) {
1136         case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1137                 if (d1 < d2)
1138                         return rq1;
1139                 else if (d2 < d1)
1140                         return rq2;
1141                 else {
1142                         if (s1 >= s2)
1143                                 return rq1;
1144                         else
1145                                 return rq2;
1146                 }
1147
1148         case CFQ_RQ2_WRAP:
1149                 return rq1;
1150         case CFQ_RQ1_WRAP:
1151                 return rq2;
1152         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1153         default:
1154                 /*
1155                  * Since both rqs are wrapped,
1156                  * start with the one that's further behind head
1157                  * (--> only *one* back seek required),
1158                  * since back seek takes more time than forward.
1159                  */
1160                 if (s1 <= s2)
1161                         return rq1;
1162                 else
1163                         return rq2;
1164         }
1165 }
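/*
 * Example for cfq_choose_req(): with the head at sector 1000,
 * cfq_back_max = 16MiB and cfq_back_penalty = 2, a request 100 sectors
 * ahead costs d1 = 100 while one 20 sectors behind costs d2 = 20 * 2 = 40,
 * so the short backward seek is preferred; anything farther behind than
 * back_max counts as wrapped and loses to any non-wrapped request.
 */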
1166
1167 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1168 {
1169         /* Service tree is empty */
1170         if (!root->count)
1171                 return NULL;
1172
1173         return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
1174 }
1175
1176 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1177 {
1178         return rb_entry_cfqg(rb_first_cached(&root->rb));
1179 }
1180
1181 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1182 {
1183         rb_erase_cached(n, &root->rb);
1184         RB_CLEAR_NODE(n);
1185
1186         --root->count;
1187 }
1188
1189 /*
1190  * would be nice to take fifo expire time into account as well
1191  */
1192 static struct request *
1193 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1194                   struct request *last)
1195 {
1196         struct rb_node *rbnext = rb_next(&last->rb_node);
1197         struct rb_node *rbprev = rb_prev(&last->rb_node);
1198         struct request *next = NULL, *prev = NULL;
1199
1200         BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1201
1202         if (rbprev)
1203                 prev = rb_entry_rq(rbprev);
1204
1205         if (rbnext)
1206                 next = rb_entry_rq(rbnext);
1207         else {
1208                 rbnext = rb_first(&cfqq->sort_list);
1209                 if (rbnext && rbnext != &last->rb_node)
1210                         next = rb_entry_rq(rbnext);
1211         }
1212
1213         return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1214 }
1215
1216 static u64 cfq_slice_offset(struct cfq_data *cfqd,
1217                             struct cfq_queue *cfqq)
1218 {
1219         /*
1220          * just an approximation, should be ok.
1221          */
1222         return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1223                        cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1224 }
1225
1226 static inline s64
1227 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1228 {
1229         return cfqg->vdisktime - st->min_vdisktime;
1230 }
1231
1232 static void
1233 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1234 {
1235         struct rb_node **node = &st->rb.rb_root.rb_node;
1236         struct rb_node *parent = NULL;
1237         struct cfq_group *__cfqg;
1238         s64 key = cfqg_key(st, cfqg);
1239         bool leftmost = true;
1240
1241         while (*node != NULL) {
1242                 parent = *node;
1243                 __cfqg = rb_entry_cfqg(parent);
1244
1245                 if (key < cfqg_key(st, __cfqg))
1246                         node = &parent->rb_left;
1247                 else {
1248                         node = &parent->rb_right;
1249                         leftmost = false;
1250                 }
1251         }
1252
1253         rb_link_node(&cfqg->rb_node, parent, node);
1254         rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
1255 }
1256
1257 /*
1258  * This has to be called only on activation of cfqg
1259  */
1260 static void
1261 cfq_update_group_weight(struct cfq_group *cfqg)
1262 {
1263         if (cfqg->new_weight) {
1264                 cfqg->weight = cfqg->new_weight;
1265                 cfqg->new_weight = 0;
1266         }
1267 }
1268
1269 static void
1270 cfq_update_group_leaf_weight(struct cfq_group *cfqg)
1271 {
1272         BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1273
1274         if (cfqg->new_leaf_weight) {
1275                 cfqg->leaf_weight = cfqg->new_leaf_weight;
1276                 cfqg->new_leaf_weight = 0;
1277         }
1278 }
1279
1280 static void
1281 cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1282 {
1283         unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;      /* start with 1 */
1284         struct cfq_group *pos = cfqg;
1285         struct cfq_group *parent;
1286         bool propagate;
1287
1288         /* add to the service tree */
1289         BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1290
1291         /*
1292          * Update leaf_weight.  We cannot update weight at this point
1293          * because cfqg might already have been activated and is
1294          * contributing its current weight to the parent's children_weight.
1295          */
1296         cfq_update_group_leaf_weight(cfqg);
1297         __cfq_group_service_tree_add(st, cfqg);
1298
1299         /*
1300          * Activate @cfqg and calculate the portion of vfraction @cfqg is
1301          * entitled to.  vfraction is calculated by walking the tree
1302          * towards the root calculating the fraction it has at each level.
1303          * The compounded ratio is how much vfraction @cfqg owns.
1304          *
1305          * Start with the proportion tasks in this cfqg has against active
1306          * children cfqgs - its leaf_weight against children_weight.
1307          */
1308         propagate = !pos->nr_active++;
1309         pos->children_weight += pos->leaf_weight;
1310         vfr = vfr * pos->leaf_weight / pos->children_weight;
1311
1312         /*
1313          * Compound ->weight walking up the tree.  Both activation and
1314          * vfraction calculation are done in the same loop.  Propagation
1315          * stops once an already activated node is met.  vfraction
1316          * calculation should always continue to the root.
1317          */
1318         while ((parent = cfqg_parent(pos))) {
1319                 if (propagate) {
1320                         cfq_update_group_weight(pos);
1321                         propagate = !parent->nr_active++;
1322                         parent->children_weight += pos->weight;
1323                 }
1324                 vfr = vfr * pos->weight / parent->children_weight;
1325                 pos = parent;
1326         }
1327
1328         cfqg->vfraction = max_t(unsigned, vfr, 1);
1329 }
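/*
 * For illustration: if this cfqg's leaf_weight is half of its own
 * children_weight and its weight is a quarter of its parent's
 * children_weight (the parent being the root), the compounding above
 * leaves it with a vfraction of roughly 1/8 of the device, expressed in
 * 1 << CFQ_SERVICE_SHIFT fixed point.
 */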
1330
1331 static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
1332 {
1333         if (!iops_mode(cfqd))
1334                 return CFQ_SLICE_MODE_GROUP_DELAY;
1335         else
1336                 return CFQ_IOPS_MODE_GROUP_DELAY;
1337 }
1338
1339 static void
1340 cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1341 {
1342         struct cfq_rb_root *st = &cfqd->grp_service_tree;
1343         struct cfq_group *__cfqg;
1344         struct rb_node *n;
1345
1346         cfqg->nr_cfqq++;
1347         if (!RB_EMPTY_NODE(&cfqg->rb_node))
1348                 return;
1349
1350         /*
1351          * Currently put the group at the end. Later, implement something
1352          * so that groups get a lesser vtime based on their weights, so that
1353          * a group does not lose everything if it was not continuously backlogged.
1354          */
1355         n = rb_last(&st->rb.rb_root);
1356         if (n) {
1357                 __cfqg = rb_entry_cfqg(n);
1358                 cfqg->vdisktime = __cfqg->vdisktime +
1359                         cfq_get_cfqg_vdisktime_delay(cfqd);
1360         } else
1361                 cfqg->vdisktime = st->min_vdisktime;
1362         cfq_group_service_tree_add(st, cfqg);
1363 }
1364
1365 static void
1366 cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1367 {
1368         struct cfq_group *pos = cfqg;
1369         bool propagate;
1370
1371         /*
1372          * Undo activation from cfq_group_service_tree_add().  Deactivate
1373          * @cfqg and propagate deactivation upwards.
1374          */
1375         propagate = !--pos->nr_active;
1376         pos->children_weight -= pos->leaf_weight;
1377
1378         while (propagate) {
1379                 struct cfq_group *parent = cfqg_parent(pos);
1380
1381                 /* @pos has 0 nr_active at this point */
1382                 WARN_ON_ONCE(pos->children_weight);
1383                 pos->vfraction = 0;
1384
1385                 if (!parent)
1386                         break;
1387
1388                 propagate = !--parent->nr_active;
1389                 parent->children_weight -= pos->weight;
1390                 pos = parent;
1391         }
1392
1393         /* remove from the service tree */
1394         if (!RB_EMPTY_NODE(&cfqg->rb_node))
1395                 cfq_rb_erase(&cfqg->rb_node, st);
1396 }
1397
1398 static void
1399 cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1400 {
1401         struct cfq_rb_root *st = &cfqd->grp_service_tree;
1402
1403         BUG_ON(cfqg->nr_cfqq < 1);
1404         cfqg->nr_cfqq--;
1405
1406         /* If there are other cfq queues under this group, don't delete it */
1407         if (cfqg->nr_cfqq)
1408                 return;
1409
1410         cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1411         cfq_group_service_tree_del(st, cfqg);
1412         cfqg->saved_wl_slice = 0;
1413         cfqg_stats_update_dequeue(cfqg);
1414 }
1415
1416 static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1417                                        u64 *unaccounted_time)
1418 {
1419         u64 slice_used;
1420         u64 now = ktime_get_ns();
1421
1422         /*
1423          * Queue got expired before even a single request completed or
1424          * got expired immediately after first request completion.
1425          */
1426         if (!cfqq->slice_start || cfqq->slice_start == now) {
1427                 /*
1428                  * Also charge the seek time incurred to the group, otherwise
1429                  * if there are multiple queues in the group, each can dispatch
1430                  * a single request on seeky media and cause lots of seek time
1431                  * and group will never know it.
1432                  */
1433                 slice_used = max_t(u64, (now - cfqq->dispatch_start),
1434                                         jiffies_to_nsecs(1));
1435         } else {
1436                 slice_used = now - cfqq->slice_start;
1437                 if (slice_used > cfqq->allocated_slice) {
1438                         *unaccounted_time = slice_used - cfqq->allocated_slice;
1439                         slice_used = cfqq->allocated_slice;
1440                 }
1441                 if (cfqq->slice_start > cfqq->dispatch_start)
1442                         *unaccounted_time += cfqq->slice_start -
1443                                         cfqq->dispatch_start;
1444         }
1445
1446         return slice_used;
1447 }
1448
1449 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1450                                 struct cfq_queue *cfqq)
1451 {
1452         struct cfq_rb_root *st = &cfqd->grp_service_tree;
1453         u64 used_sl, charge, unaccounted_sl = 0;
1454         int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1455                         - cfqg->service_tree_idle.count;
1456         unsigned int vfr;
1457         u64 now = ktime_get_ns();
1458
1459         BUG_ON(nr_sync < 0);
1460         used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1461
1462         if (iops_mode(cfqd))
1463                 charge = cfqq->slice_dispatch;
1464         else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1465                 charge = cfqq->allocated_slice;
1466
1467         /*
1468          * Can't update vdisktime while on service tree and cfqg->vfraction
1469          * is valid only while on it.  Cache vfr, leave the service tree,
1470          * update vdisktime and go back on.  The re-addition to the tree
1471          * will also update the weights as necessary.
1472          */
1473         vfr = cfqg->vfraction;
1474         cfq_group_service_tree_del(st, cfqg);
1475         cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1476         cfq_group_service_tree_add(st, cfqg);
1477
1478         /* This group is being expired. Save the context */
1479         if (cfqd->workload_expires > now) {
1480                 cfqg->saved_wl_slice = cfqd->workload_expires - now;
1481                 cfqg->saved_wl_type = cfqd->serving_wl_type;
1482                 cfqg->saved_wl_class = cfqd->serving_wl_class;
1483         } else
1484                 cfqg->saved_wl_slice = 0;
1485
1486         cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1487                                         st->min_vdisktime);
1488         cfq_log_cfqq(cfqq->cfqd, cfqq,
1489                      "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
1490                      used_sl, cfqq->slice_dispatch, charge,
1491                      iops_mode(cfqd), cfqq->nr_sectors);
1492         cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1493         cfqg_stats_set_start_empty_time(cfqg);
1494 }
1495
1496 /**
1497  * cfq_init_cfqg_base - initialize base part of a cfq_group
1498  * @cfqg: cfq_group to initialize
1499  *
1500  * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1501  * is enabled or not.
1502  */
1503 static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1504 {
1505         struct cfq_rb_root *st;
1506         int i, j;
1507
1508         for_each_cfqg_st(cfqg, i, j, st)
1509                 *st = CFQ_RB_ROOT;
1510         RB_CLEAR_NODE(&cfqg->rb_node);
1511
1512         cfqg->ttime.last_end_request = ktime_get_ns();
1513 }
1514
1515 #ifdef CONFIG_CFQ_GROUP_IOSCHED
1516 static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1517                             bool on_dfl, bool reset_dev, bool is_leaf_weight);
1518
1519 static void cfqg_stats_exit(struct cfqg_stats *stats)
1520 {
1521         blkg_rwstat_exit(&stats->merged);
1522         blkg_rwstat_exit(&stats->service_time);
1523         blkg_rwstat_exit(&stats->wait_time);
1524         blkg_rwstat_exit(&stats->queued);
1525         blkg_stat_exit(&stats->time);
1526 #ifdef CONFIG_DEBUG_BLK_CGROUP
1527         blkg_stat_exit(&stats->unaccounted_time);
1528         blkg_stat_exit(&stats->avg_queue_size_sum);
1529         blkg_stat_exit(&stats->avg_queue_size_samples);
1530         blkg_stat_exit(&stats->dequeue);
1531         blkg_stat_exit(&stats->group_wait_time);
1532         blkg_stat_exit(&stats->idle_time);
1533         blkg_stat_exit(&stats->empty_time);
1534 #endif
1535 }
1536
1537 static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
1538 {
1539         if (blkg_rwstat_init(&stats->merged, gfp) ||
1540             blkg_rwstat_init(&stats->service_time, gfp) ||
1541             blkg_rwstat_init(&stats->wait_time, gfp) ||
1542             blkg_rwstat_init(&stats->queued, gfp) ||
1543             blkg_stat_init(&stats->time, gfp))
1544                 goto err;
1545
1546 #ifdef CONFIG_DEBUG_BLK_CGROUP
1547         if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
1548             blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
1549             blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
1550             blkg_stat_init(&stats->dequeue, gfp) ||
1551             blkg_stat_init(&stats->group_wait_time, gfp) ||
1552             blkg_stat_init(&stats->idle_time, gfp) ||
1553             blkg_stat_init(&stats->empty_time, gfp))
1554                 goto err;
1555 #endif
1556         return 0;
1557 err:
1558         cfqg_stats_exit(stats);
1559         return -ENOMEM;
1560 }
1561
1562 static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
1563 {
1564         struct cfq_group_data *cgd;
1565
1566         cgd = kzalloc(sizeof(*cgd), gfp);
1567         if (!cgd)
1568                 return NULL;
1569         return &cgd->cpd;
1570 }
1571
1572 static void cfq_cpd_init(struct blkcg_policy_data *cpd)
1573 {
1574         struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
1575         unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
1576                               CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1577
1578         if (cpd_to_blkcg(cpd) == &blkcg_root)
1579                 weight *= 2;
1580
1581         cgd->weight = weight;
1582         cgd->leaf_weight = weight;
1583 }
1584
1585 static void cfq_cpd_free(struct blkcg_policy_data *cpd)
1586 {
1587         kfree(cpd_to_cfqgd(cpd));
1588 }
1589
1590 static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
1591 {
1592         struct blkcg *blkcg = cpd_to_blkcg(cpd);
1593         bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
1594         unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1595
1596         if (blkcg == &blkcg_root)
1597                 weight *= 2;
1598
1599         WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
1600         WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
1601 }
1602
1603 static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
1604 {
1605         struct cfq_group *cfqg;
1606
1607         cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
1608         if (!cfqg)
1609                 return NULL;
1610
1611         cfq_init_cfqg_base(cfqg);
1612         if (cfqg_stats_init(&cfqg->stats, gfp)) {
1613                 kfree(cfqg);
1614                 return NULL;
1615         }
1616
1617         return &cfqg->pd;
1618 }
1619
1620 static void cfq_pd_init(struct blkg_policy_data *pd)
1621 {
1622         struct cfq_group *cfqg = pd_to_cfqg(pd);
1623         struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
1624
1625         cfqg->weight = cgd->weight;
1626         cfqg->leaf_weight = cgd->leaf_weight;
1627 }
1628
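/*
 * Called when the blkg goes offline: drop the references held on the
 * per-priority async queues and the idle async queue, then transfer the
 * group's stats to its parent so they are not lost.
 */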
1629 static void cfq_pd_offline(struct blkg_policy_data *pd)
1630 {
1631         struct cfq_group *cfqg = pd_to_cfqg(pd);
1632         int i;
1633
1634         for (i = 0; i < IOPRIO_BE_NR; i++) {
1635                 if (cfqg->async_cfqq[0][i])
1636                         cfq_put_queue(cfqg->async_cfqq[0][i]);
1637                 if (cfqg->async_cfqq[1][i])
1638                         cfq_put_queue(cfqg->async_cfqq[1][i]);
1639         }
1640
1641         if (cfqg->async_idle_cfqq)
1642                 cfq_put_queue(cfqg->async_idle_cfqq);
1643
1644         /*
1645          * @blkg is going offline and will be ignored by
1646          * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
1647          * that they don't get lost.  If IOs complete after this point, the
1648          * stats for them will be lost.  Oh well...
1649          */
1650         cfqg_stats_xfer_dead(cfqg);
1651 }
1652
1653 static void cfq_pd_free(struct blkg_policy_data *pd)
1654 {
1655         struct cfq_group *cfqg = pd_to_cfqg(pd);
1656
1657         cfqg_stats_exit(&cfqg->stats);
1658         kfree(cfqg);
1659 }
1660
1661 static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
1662 {
1663         struct cfq_group *cfqg = pd_to_cfqg(pd);
1664
1665         cfqg_stats_reset(&cfqg->stats);
1666 }
1667
1668 static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
1669                                          struct blkcg *blkcg)
1670 {
1671         struct blkcg_gq *blkg;
1672
1673         blkg = blkg_lookup(blkcg, cfqd->queue);
1674         if (likely(blkg))
1675                 return blkg_to_cfqg(blkg);
1676         return NULL;
1677 }
1678
1679 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1680 {
1681         cfqq->cfqg = cfqg;
1682         /* cfqq reference on cfqg */
1683         cfqg_get(cfqg);
1684 }
1685
1686 static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1687                                      struct blkg_policy_data *pd, int off)
1688 {
1689         struct cfq_group *cfqg = pd_to_cfqg(pd);
1690
1691         if (!cfqg->dev_weight)
1692                 return 0;
1693         return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1694 }
1695
1696 static int cfqg_print_weight_device(struct seq_file *sf, void *v)
1697 {
1698         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1699                           cfqg_prfill_weight_device, &blkcg_policy_cfq,
1700                           0, false);
1701         return 0;
1702 }
1703
1704 static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1705                                           struct blkg_policy_data *pd, int off)
1706 {
1707         struct cfq_group *cfqg = pd_to_cfqg(pd);
1708
1709         if (!cfqg->dev_leaf_weight)
1710                 return 0;
1711         return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1712 }
1713
1714 static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
1715 {
1716         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1717                           cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
1718                           0, false);
1719         return 0;
1720 }
1721
1722 static int cfq_print_weight(struct seq_file *sf, void *v)
1723 {
1724         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1725         struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1726         unsigned int val = 0;
1727
1728         if (cgd)
1729                 val = cgd->weight;
1730
1731         seq_printf(sf, "%u\n", val);
1732         return 0;
1733 }
1734
1735 static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1736 {
1737         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1738         struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1739         unsigned int val = 0;
1740
1741         if (cgd)
1742                 val = cgd->leaf_weight;
1743
1744         seq_printf(sf, "%u\n", val);
1745         return 0;
1746 }
1747
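/*
 * Common handler for the per-device weight files.  The input is
 * "MAJ:MIN WEIGHT", parsed via blkg_conf_prep(); writing "default"
 * (or 0 on the legacy hierarchy) clears the per-device setting so the
 * group falls back to the blkcg-wide weight.  Non-zero weights must be
 * within the [min, max] range of the hierarchy in use.
 */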
1748 static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1749                                         char *buf, size_t nbytes, loff_t off,
1750                                         bool on_dfl, bool is_leaf_weight)
1751 {
1752         unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1753         unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1754         struct blkcg *blkcg = css_to_blkcg(of_css(of));
1755         struct blkg_conf_ctx ctx;
1756         struct cfq_group *cfqg;
1757         struct cfq_group_data *cfqgd;
1758         int ret;
1759         u64 v;
1760
1761         ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1762         if (ret)
1763                 return ret;
1764
1765         if (sscanf(ctx.body, "%llu", &v) == 1) {
1766                 /* require "default" on dfl */
1767                 ret = -ERANGE;
1768                 if (!v && on_dfl)
1769                         goto out_finish;
1770         } else if (!strcmp(strim(ctx.body), "default")) {
1771                 v = 0;
1772         } else {
1773                 ret = -EINVAL;
1774                 goto out_finish;
1775         }
1776
1777         cfqg = blkg_to_cfqg(ctx.blkg);
1778         cfqgd = blkcg_to_cfqgd(blkcg);
1779
1780         ret = -ERANGE;
1781         if (!v || (v >= min && v <= max)) {
1782                 if (!is_leaf_weight) {
1783                         cfqg->dev_weight = v;
1784                         cfqg->new_weight = v ?: cfqgd->weight;
1785                 } else {
1786                         cfqg->dev_leaf_weight = v;
1787                         cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
1788                 }
1789                 ret = 0;
1790         }
1791 out_finish:
1792         blkg_conf_finish(&ctx);
1793         return ret ?: nbytes;
1794 }
1795
1796 static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
1797                                       char *buf, size_t nbytes, loff_t off)
1798 {
1799         return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
1800 }
1801
1802 static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
1803                                            char *buf, size_t nbytes, loff_t off)
1804 {
1805         return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
1806 }
1807
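/*
 * Common handler for the blkcg-wide weight files.  Updates the default
 * (leaf_)weight in cfq_group_data and pushes it to every cfq_group in
 * the blkcg that has no per-device override, optionally clearing those
 * overrides when @reset_dev is set.
 */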
1808 static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1809                             bool on_dfl, bool reset_dev, bool is_leaf_weight)
1810 {
1811         unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1812         unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1813         struct blkcg *blkcg = css_to_blkcg(css);
1814         struct blkcg_gq *blkg;
1815         struct cfq_group_data *cfqgd;
1816         int ret = 0;
1817
1818         if (val < min || val > max)
1819                 return -ERANGE;
1820
1821         spin_lock_irq(&blkcg->lock);
1822         cfqgd = blkcg_to_cfqgd(blkcg);
1823         if (!cfqgd) {
1824                 ret = -EINVAL;
1825                 goto out;
1826         }
1827
1828         if (!is_leaf_weight)
1829                 cfqgd->weight = val;
1830         else
1831                 cfqgd->leaf_weight = val;
1832
1833         hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1834                 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1835
1836                 if (!cfqg)
1837                         continue;
1838
1839                 if (!is_leaf_weight) {
1840                         if (reset_dev)
1841                                 cfqg->dev_weight = 0;
1842                         if (!cfqg->dev_weight)
1843                                 cfqg->new_weight = cfqgd->weight;
1844                 } else {
1845                         if (reset_dev)
1846                                 cfqg->dev_leaf_weight = 0;
1847                         if (!cfqg->dev_leaf_weight)
1848                                 cfqg->new_leaf_weight = cfqgd->leaf_weight;
1849                 }
1850         }
1851
1852 out:
1853         spin_unlock_irq(&blkcg->lock);
1854         return ret;
1855 }
1856
1857 static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1858                           u64 val)
1859 {
1860         return __cfq_set_weight(css, val, false, false, false);
1861 }
1862
1863 static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
1864                                struct cftype *cft, u64 val)
1865 {
1866         return __cfq_set_weight(css, val, false, false, true);
1867 }
1868
1869 static int cfqg_print_stat(struct seq_file *sf, void *v)
1870 {
1871         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
1872                           &blkcg_policy_cfq, seq_cft(sf)->private, false);
1873         return 0;
1874 }
1875
1876 static int cfqg_print_rwstat(struct seq_file *sf, void *v)
1877 {
1878         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
1879                           &blkcg_policy_cfq, seq_cft(sf)->private, true);
1880         return 0;
1881 }
1882
1883 static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1884                                       struct blkg_policy_data *pd, int off)
1885 {
1886         u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
1887                                           &blkcg_policy_cfq, off);
1888         return __blkg_prfill_u64(sf, pd, sum);
1889 }
1890
1891 static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1892                                         struct blkg_policy_data *pd, int off)
1893 {
1894         struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
1895                                                         &blkcg_policy_cfq, off);
1896         return __blkg_prfill_rwstat(sf, pd, &sum);
1897 }
1898
1899 static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
1900 {
1901         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1902                           cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
1903                           seq_cft(sf)->private, false);
1904         return 0;
1905 }
1906
1907 static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1908 {
1909         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1910                           cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
1911                           seq_cft(sf)->private, true);
1912         return 0;
1913 }
1914
1915 static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
1916                                int off)
1917 {
1918         u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
1919
1920         return __blkg_prfill_u64(sf, pd, sum >> 9);
1921 }
1922
1923 static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
1924 {
1925         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1926                           cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
1927         return 0;
1928 }
1929
1930 static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
1931                                          struct blkg_policy_data *pd, int off)
1932 {
1933         struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
1934                                         offsetof(struct blkcg_gq, stat_bytes));
1935         u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
1936                 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
1937
1938         return __blkg_prfill_u64(sf, pd, sum >> 9);
1939 }
1940
1941 static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
1942 {
1943         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1944                           cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
1945                           false);
1946         return 0;
1947 }
1948
1949 #ifdef CONFIG_DEBUG_BLK_CGROUP
1950 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1951                                       struct blkg_policy_data *pd, int off)
1952 {
1953         struct cfq_group *cfqg = pd_to_cfqg(pd);
1954         u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1955         u64 v = 0;
1956
1957         if (samples) {
1958                 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1959                 v = div64_u64(v, samples);
1960         }
1961         __blkg_prfill_u64(sf, pd, v);
1962         return 0;
1963 }
1964
1965 /* print avg_queue_size */
1966 static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1967 {
1968         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1969                           cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
1970                           0, false);
1971         return 0;
1972 }
1973 #endif  /* CONFIG_DEBUG_BLK_CGROUP */
1974
1975 static struct cftype cfq_blkcg_legacy_files[] = {
1976         /* on root, weight is mapped to leaf_weight */
1977         {
1978                 .name = "weight_device",
1979                 .flags = CFTYPE_ONLY_ON_ROOT,
1980                 .seq_show = cfqg_print_leaf_weight_device,
1981                 .write = cfqg_set_leaf_weight_device,
1982         },
1983         {
1984                 .name = "weight",
1985                 .flags = CFTYPE_ONLY_ON_ROOT,
1986                 .seq_show = cfq_print_leaf_weight,
1987                 .write_u64 = cfq_set_leaf_weight,
1988         },
1989
1990         /* no such mapping necessary for !roots */
1991         {
1992                 .name = "weight_device",
1993                 .flags = CFTYPE_NOT_ON_ROOT,
1994                 .seq_show = cfqg_print_weight_device,
1995                 .write = cfqg_set_weight_device,
1996         },
1997         {
1998                 .name = "weight",
1999                 .flags = CFTYPE_NOT_ON_ROOT,
2000                 .seq_show = cfq_print_weight,
2001                 .write_u64 = cfq_set_weight,
2002         },
2003
2004         {
2005                 .name = "leaf_weight_device",
2006                 .seq_show = cfqg_print_leaf_weight_device,
2007                 .write = cfqg_set_leaf_weight_device,
2008         },
2009         {
2010                 .name = "leaf_weight",
2011                 .seq_show = cfq_print_leaf_weight,
2012                 .write_u64 = cfq_set_leaf_weight,
2013         },
2014
2015         /* statistics, covers only the tasks in the cfqg */
2016         {
2017                 .name = "time",
2018                 .private = offsetof(struct cfq_group, stats.time),
2019                 .seq_show = cfqg_print_stat,
2020         },
2021         {
2022                 .name = "sectors",
2023                 .seq_show = cfqg_print_stat_sectors,
2024         },
2025         {
2026                 .name = "io_service_bytes",
2027                 .private = (unsigned long)&blkcg_policy_cfq,
2028                 .seq_show = blkg_print_stat_bytes,
2029         },
2030         {
2031                 .name = "io_serviced",
2032                 .private = (unsigned long)&blkcg_policy_cfq,
2033                 .seq_show = blkg_print_stat_ios,
2034         },
2035         {
2036                 .name = "io_service_time",
2037                 .private = offsetof(struct cfq_group, stats.service_time),
2038                 .seq_show = cfqg_print_rwstat,
2039         },
2040         {
2041                 .name = "io_wait_time",
2042                 .private = offsetof(struct cfq_group, stats.wait_time),
2043                 .seq_show = cfqg_print_rwstat,
2044         },
2045         {
2046                 .name = "io_merged",
2047                 .private = offsetof(struct cfq_group, stats.merged),
2048                 .seq_show = cfqg_print_rwstat,
2049         },
2050         {
2051                 .name = "io_queued",
2052                 .private = offsetof(struct cfq_group, stats.queued),
2053                 .seq_show = cfqg_print_rwstat,
2054         },
2055
2056         /* the same statistics, covering the cfqg and its descendants */
2057         {
2058                 .name = "time_recursive",
2059                 .private = offsetof(struct cfq_group, stats.time),
2060                 .seq_show = cfqg_print_stat_recursive,
2061         },
2062         {
2063                 .name = "sectors_recursive",
2064                 .seq_show = cfqg_print_stat_sectors_recursive,
2065         },
2066         {
2067                 .name = "io_service_bytes_recursive",
2068                 .private = (unsigned long)&blkcg_policy_cfq,
2069                 .seq_show = blkg_print_stat_bytes_recursive,
2070         },
2071         {
2072                 .name = "io_serviced_recursive",
2073                 .private = (unsigned long)&blkcg_policy_cfq,
2074                 .seq_show = blkg_print_stat_ios_recursive,
2075         },
2076         {
2077                 .name = "io_service_time_recursive",
2078                 .private = offsetof(struct cfq_group, stats.service_time),
2079                 .seq_show = cfqg_print_rwstat_recursive,
2080         },
2081         {
2082                 .name = "io_wait_time_recursive",
2083                 .private = offsetof(struct cfq_group, stats.wait_time),
2084                 .seq_show = cfqg_print_rwstat_recursive,
2085         },
2086         {
2087                 .name = "io_merged_recursive",
2088                 .private = offsetof(struct cfq_group, stats.merged),
2089                 .seq_show = cfqg_print_rwstat_recursive,
2090         },
2091         {
2092                 .name = "io_queued_recursive",
2093                 .private = offsetof(struct cfq_group, stats.queued),
2094                 .seq_show = cfqg_print_rwstat_recursive,
2095         },
2096 #ifdef CONFIG_DEBUG_BLK_CGROUP
2097         {
2098                 .name = "avg_queue_size",
2099                 .seq_show = cfqg_print_avg_queue_size,
2100         },
2101         {
2102                 .name = "group_wait_time",
2103                 .private = offsetof(struct cfq_group, stats.group_wait_time),
2104                 .seq_show = cfqg_print_stat,
2105         },
2106         {
2107                 .name = "idle_time",
2108                 .private = offsetof(struct cfq_group, stats.idle_time),
2109                 .seq_show = cfqg_print_stat,
2110         },
2111         {
2112                 .name = "empty_time",
2113                 .private = offsetof(struct cfq_group, stats.empty_time),
2114                 .seq_show = cfqg_print_stat,
2115         },
2116         {
2117                 .name = "dequeue",
2118                 .private = offsetof(struct cfq_group, stats.dequeue),
2119                 .seq_show = cfqg_print_stat,
2120         },
2121         {
2122                 .name = "unaccounted_time",
2123                 .private = offsetof(struct cfq_group, stats.unaccounted_time),
2124                 .seq_show = cfqg_print_stat,
2125         },
2126 #endif  /* CONFIG_DEBUG_BLK_CGROUP */
2127         { }     /* terminate */
2128 };
2129
2130 static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
2131 {
2132         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2133         struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
2134
2135         seq_printf(sf, "default %u\n", cgd->weight);
2136         blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
2137                           &blkcg_policy_cfq, 0, false);
2138         return 0;
2139 }
2140
2141 static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
2142                                      char *buf, size_t nbytes, loff_t off)
2143 {
2144         char *endp;
2145         int ret;
2146         u64 v;
2147
2148         buf = strim(buf);
2149
2150         /* "WEIGHT" or "default WEIGHT" sets the default weight */
2151         v = simple_strtoull(buf, &endp, 0);
2152         if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
2153                 ret = __cfq_set_weight(of_css(of), v, true, false, false);
2154                 return ret ?: nbytes;
2155         }
2156
2157         /* "MAJ:MIN WEIGHT" */
2158         return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
2159 }
2160
2161 static struct cftype cfq_blkcg_files[] = {
2162         {
2163                 .name = "weight",
2164                 .flags = CFTYPE_NOT_ON_ROOT,
2165                 .seq_show = cfq_print_weight_on_dfl,
2166                 .write = cfq_set_weight_on_dfl,
2167         },
2168         { }     /* terminate */
2169 };
2170
2171 #else /* GROUP_IOSCHED */
2172 static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
2173                                          struct blkcg *blkcg)
2174 {
2175         return cfqd->root_group;
2176 }
2177
2178 static inline void
2179 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
2180         cfqq->cfqg = cfqg;
2181 }
2182
2183 #endif /* GROUP_IOSCHED */
2184
2185 /*
2186  * The cfqd->service_trees holds all pending cfq_queue's that have
2187  * requests waiting to be processed. It is sorted in the order that
2188  * we will service the queues.
2189  */
2190 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2191                                  bool add_front)
2192 {
2193         struct rb_node **p, *parent;
2194         struct cfq_queue *__cfqq;
2195         u64 rb_key;
2196         struct cfq_rb_root *st;
2197         bool leftmost = true;
2198         int new_cfqq = 1;
2199         u64 now = ktime_get_ns();
2200
2201         st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
2202         if (cfq_class_idle(cfqq)) {
2203                 rb_key = CFQ_IDLE_DELAY;
2204                 parent = rb_last(&st->rb.rb_root);
2205                 if (parent && parent != &cfqq->rb_node) {
2206                         __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2207                         rb_key += __cfqq->rb_key;
2208                 } else
2209                         rb_key += now;
2210         } else if (!add_front) {
2211                 /*
2212                  * Get our rb key offset. Subtract any residual slice
2213                  * value carried from last service. A negative resid
2214                  * count indicates slice overrun, and this should position
2215                  * the next service time further away in the tree.
2216                  */
2217                 rb_key = cfq_slice_offset(cfqd, cfqq) + now;
2218                 rb_key -= cfqq->slice_resid;
2219                 cfqq->slice_resid = 0;
2220         } else {
2221                 rb_key = -NSEC_PER_SEC;
2222                 __cfqq = cfq_rb_first(st);
2223                 rb_key += __cfqq ? __cfqq->rb_key : now;
2224         }
2225
2226         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2227                 new_cfqq = 0;
2228                 /*
2229                  * same position, nothing more to do
2230                  */
2231                 if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
2232                         return;
2233
2234                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2235                 cfqq->service_tree = NULL;
2236         }
2237
2238         parent = NULL;
2239         cfqq->service_tree = st;
2240         p = &st->rb.rb_root.rb_node;
2241         while (*p) {
2242                 parent = *p;
2243                 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2244
2245                 /*
2246                  * sort by key, that represents service time.
2247                  */
2248                 if (rb_key < __cfqq->rb_key)
2249                         p = &parent->rb_left;
2250                 else {
2251                         p = &parent->rb_right;
2252                         leftmost = false;
2253                 }
2254         }
2255
2256         cfqq->rb_key = rb_key;
2257         rb_link_node(&cfqq->rb_node, parent, p);
2258         rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost);
2259         st->count++;
2260         if (add_front || !new_cfqq)
2261                 return;
2262         cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
2263 }
2264
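/*
 * Look up the queue whose next_rq starts at @sector in the given prio
 * tree.  On a miss, NULL is returned and *ret_parent (and *rb_link, if
 * provided) are set up so the caller can insert a new node in place.
 */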
2265 static struct cfq_queue *
2266 cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
2267                      sector_t sector, struct rb_node **ret_parent,
2268                      struct rb_node ***rb_link)
2269 {
2270         struct rb_node **p, *parent;
2271         struct cfq_queue *cfqq = NULL;
2272
2273         parent = NULL;
2274         p = &root->rb_node;
2275         while (*p) {
2276                 struct rb_node **n;
2277
2278                 parent = *p;
2279                 cfqq = rb_entry(parent, struct cfq_queue, p_node);
2280
2281                 /*
2282                  * Sort strictly based on sector.  Smallest to the left,
2283                  * largest to the right.
2284                  */
2285                 if (sector > blk_rq_pos(cfqq->next_rq))
2286                         n = &(*p)->rb_right;
2287                 else if (sector < blk_rq_pos(cfqq->next_rq))
2288                         n = &(*p)->rb_left;
2289                 else
2290                         break;
2291                 p = n;
2292                 cfqq = NULL;
2293         }
2294
2295         *ret_parent = parent;
2296         if (rb_link)
2297                 *rb_link = p;
2298         return cfqq;
2299 }
2300
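/*
 * (Re)position @cfqq in the prio tree for its org_ioprio, keyed by the
 * sector of its next_rq.  Idle-class queues and queues with no pending
 * request are kept out; if another queue already occupies the same
 * sector, @cfqq is simply left out of the tree (p_root stays NULL).
 */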
2301 static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2302 {
2303         struct rb_node **p, *parent;
2304         struct cfq_queue *__cfqq;
2305
2306         if (cfqq->p_root) {
2307                 rb_erase(&cfqq->p_node, cfqq->p_root);
2308                 cfqq->p_root = NULL;
2309         }
2310
2311         if (cfq_class_idle(cfqq))
2312                 return;
2313         if (!cfqq->next_rq)
2314                 return;
2315
2316         cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
2317         __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
2318                                       blk_rq_pos(cfqq->next_rq), &parent, &p);
2319         if (!__cfqq) {
2320                 rb_link_node(&cfqq->p_node, parent, p);
2321                 rb_insert_color(&cfqq->p_node, cfqq->p_root);
2322         } else
2323                 cfqq->p_root = NULL;
2324 }
2325
2326 /*
2327  * Update cfqq's position in the service tree.
2328  */
2329 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2330 {
2331         /*
2332          * Resorting requires the cfqq to be on the RR list already.
2333          */
2334         if (cfq_cfqq_on_rr(cfqq)) {
2335                 cfq_service_tree_add(cfqd, cfqq, false);
2336                 cfq_prio_tree_add(cfqd, cfqq);
2337         }
2338 }
2339
2340 /*
2341  * add to busy list of queues for service, trying to be fair in ordering
2342  * the pending list according to last request service
2343  */
2344 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2345 {
2346         cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
2347         BUG_ON(cfq_cfqq_on_rr(cfqq));
2348         cfq_mark_cfqq_on_rr(cfqq);
2349         cfqd->busy_queues++;
2350         if (cfq_cfqq_sync(cfqq))
2351                 cfqd->busy_sync_queues++;
2352
2353         cfq_resort_rr_list(cfqd, cfqq);
2354 }
2355
2356 /*
2357  * Called when the cfqq no longer has requests pending, remove it from
2358  * the service tree.
2359  */
2360 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2361 {
2362         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
2363         BUG_ON(!cfq_cfqq_on_rr(cfqq));
2364         cfq_clear_cfqq_on_rr(cfqq);
2365
2366         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2367                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2368                 cfqq->service_tree = NULL;
2369         }
2370         if (cfqq->p_root) {
2371                 rb_erase(&cfqq->p_node, cfqq->p_root);
2372                 cfqq->p_root = NULL;
2373         }
2374
2375         cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
2376         BUG_ON(!cfqd->busy_queues);
2377         cfqd->busy_queues--;
2378         if (cfq_cfqq_sync(cfqq))
2379                 cfqd->busy_sync_queues--;
2380 }
2381
2382 /*
2383  * rb tree support functions
2384  */
2385 static void cfq_del_rq_rb(struct request *rq)
2386 {
2387         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2388         const int sync = rq_is_sync(rq);
2389
2390         BUG_ON(!cfqq->queued[sync]);
2391         cfqq->queued[sync]--;
2392
2393         elv_rb_del(&cfqq->sort_list, rq);
2394
2395         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2396                 /*
2397                  * Queue will be deleted from service tree when we actually
2398                  * expire it later. Right now just remove it from prio tree
2399                  * as it is empty.
2400                  */
2401                 if (cfqq->p_root) {
2402                         rb_erase(&cfqq->p_node, cfqq->p_root);
2403                         cfqq->p_root = NULL;
2404                 }
2405         }
2406 }
2407
2408 static void cfq_add_rq_rb(struct request *rq)
2409 {
2410         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2411         struct cfq_data *cfqd = cfqq->cfqd;
2412         struct request *prev;
2413
2414         cfqq->queued[rq_is_sync(rq)]++;
2415
2416         elv_rb_add(&cfqq->sort_list, rq);
2417
2418         if (!cfq_cfqq_on_rr(cfqq))
2419                 cfq_add_cfqq_rr(cfqd, cfqq);
2420
2421         /*
2422          * check if this request is a better next-serve candidate
2423          */
2424         prev = cfqq->next_rq;
2425         cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2426
2427         /*
2428          * adjust priority tree position, if ->next_rq changes
2429          */
2430         if (prev != cfqq->next_rq)
2431                 cfq_prio_tree_add(cfqd, cfqq);
2432
2433         BUG_ON(!cfqq->next_rq);
2434 }
2435
2436 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
2437 {
2438         elv_rb_del(&cfqq->sort_list, rq);
2439         cfqq->queued[rq_is_sync(rq)]--;
2440         cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2441         cfq_add_rq_rb(rq);
2442         cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2443                                  rq->cmd_flags);
2444 }
2445
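/*
 * Find a front-merge candidate for @bio: look up the current task's cfqq
 * and return the queued request, if any, that begins right where @bio
 * ends.
 */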
2446 static struct request *
2447 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2448 {
2449         struct task_struct *tsk = current;
2450         struct cfq_io_cq *cic;
2451         struct cfq_queue *cfqq;
2452
2453         cic = cfq_cic_lookup(cfqd, tsk->io_context);
2454         if (!cic)
2455                 return NULL;
2456
2457         cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
2458         if (cfqq)
2459                 return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
2460
2461         return NULL;
2462 }
2463
2464 static void cfq_activate_request(struct request_queue *q, struct request *rq)
2465 {
2466         struct cfq_data *cfqd = q->elevator->elevator_data;
2467
2468         cfqd->rq_in_driver++;
2469         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2470                                                 cfqd->rq_in_driver);
2471
2472         cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2473 }
2474
2475 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
2476 {
2477         struct cfq_data *cfqd = q->elevator->elevator_data;
2478
2479         WARN_ON(!cfqd->rq_in_driver);
2480         cfqd->rq_in_driver--;
2481         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2482                                                 cfqd->rq_in_driver);
2483 }
2484
2485 static void cfq_remove_request(struct request *rq)
2486 {
2487         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2488
2489         if (cfqq->next_rq == rq)
2490                 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
2491
2492         list_del_init(&rq->queuelist);
2493         cfq_del_rq_rb(rq);
2494
2495         cfqq->cfqd->rq_queued--;
2496         cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2497         if (rq->cmd_flags & REQ_PRIO) {
2498                 WARN_ON(!cfqq->prio_pending);
2499                 cfqq->prio_pending--;
2500         }
2501 }
2502
2503 static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
2504                      struct bio *bio)
2505 {
2506         struct cfq_data *cfqd = q->elevator->elevator_data;
2507         struct request *__rq;
2508
2509         __rq = cfq_find_rq_fmerge(cfqd, bio);
2510         if (__rq && elv_bio_merge_ok(__rq, bio)) {
2511                 *req = __rq;
2512                 return ELEVATOR_FRONT_MERGE;
2513         }
2514
2515         return ELEVATOR_NO_MERGE;
2516 }
2517
2518 static void cfq_merged_request(struct request_queue *q, struct request *req,
2519                                enum elv_merge type)
2520 {
2521         if (type == ELEVATOR_FRONT_MERGE) {
2522                 struct cfq_queue *cfqq = RQ_CFQQ(req);
2523
2524                 cfq_reposition_rq_rb(cfqq, req);
2525         }
2526 }
2527
2528 static void cfq_bio_merged(struct request_queue *q, struct request *req,
2529                                 struct bio *bio)
2530 {
2531         cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
2532 }
2533
2534 static void
2535 cfq_merged_requests(struct request_queue *q, struct request *rq,
2536                     struct request *next)
2537 {
2538         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2539         struct cfq_data *cfqd = q->elevator->elevator_data;
2540
2541         /*
2542          * reposition in fifo if next is older than rq
2543          */
2544         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2545             next->fifo_time < rq->fifo_time &&
2546             cfqq == RQ_CFQQ(next)) {
2547                 list_move(&rq->queuelist, &next->queuelist);
2548                 rq->fifo_time = next->fifo_time;
2549         }
2550
2551         if (cfqq->next_rq == next)
2552                 cfqq->next_rq = rq;
2553         cfq_remove_request(next);
2554         cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2555
2556         cfqq = RQ_CFQQ(next);
2557         /*
2558          * All requests of this queue have been merged into other queues;
2559          * delete it from the service tree. If it's the active_queue,
2560          * cfq_dispatch_requests() will choose to expire it or to idle.
2561          */
2562         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2563             cfqq != cfqd->active_queue)
2564                 cfq_del_cfqq_rr(cfqd, cfqq);
2565 }
2566
2567 static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
2568                                struct bio *bio)
2569 {
2570         struct cfq_data *cfqd = q->elevator->elevator_data;
2571         bool is_sync = op_is_sync(bio->bi_opf);
2572         struct cfq_io_cq *cic;
2573         struct cfq_queue *cfqq;
2574
2575         /*
2576          * Disallow merge of a sync bio into an async request.
2577          */
2578         if (is_sync && !rq_is_sync(rq))
2579                 return false;
2580
2581         /*
2582          * Lookup the cfqq that this bio will be queued with and allow
2583          * merge only if rq is queued there.
2584          */
2585         cic = cfq_cic_lookup(cfqd, current->io_context);
2586         if (!cic)
2587                 return false;
2588
2589         cfqq = cic_to_cfqq(cic, is_sync);
2590         return cfqq == RQ_CFQQ(rq);
2591 }
2592
2593 static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
2594                               struct request *next)
2595 {
2596         return RQ_CFQQ(rq) == RQ_CFQQ(next);
2597 }
2598
2599 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2600 {
2601         hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
2602         cfqg_stats_update_idle_time(cfqq->cfqg);
2603 }
2604
2605 static void __cfq_set_active_queue(struct cfq_data *cfqd,
2606                                    struct cfq_queue *cfqq)
2607 {
2608         if (cfqq) {
2609                 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2610                                 cfqd->serving_wl_class, cfqd->serving_wl_type);
2611                 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2612                 cfqq->slice_start = 0;
2613                 cfqq->dispatch_start = ktime_get_ns();
2614                 cfqq->allocated_slice = 0;
2615                 cfqq->slice_end = 0;
2616                 cfqq->slice_dispatch = 0;
2617                 cfqq->nr_sectors = 0;
2618
2619                 cfq_clear_cfqq_wait_request(cfqq);
2620                 cfq_clear_cfqq_must_dispatch(cfqq);
2621                 cfq_clear_cfqq_must_alloc_slice(cfqq);
2622                 cfq_clear_cfqq_fifo_expire(cfqq);
2623                 cfq_mark_cfqq_slice_new(cfqq);
2624
2625                 cfq_del_timer(cfqd, cfqq);
2626         }
2627
2628         cfqd->active_queue = cfqq;
2629 }
2630
2631 /*
2632  * current cfqq expired its slice (or was too idle), select new one
2633  */
2634 static void
2635 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2636                     bool timed_out)
2637 {
2638         cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2639
2640         if (cfq_cfqq_wait_request(cfqq))
2641                 cfq_del_timer(cfqd, cfqq);
2642
2643         cfq_clear_cfqq_wait_request(cfqq);
2644         cfq_clear_cfqq_wait_busy(cfqq);
2645
2646         /*
2647          * If this cfqq is shared between multiple processes, check to
2648          * make sure that those processes are still issuing I/Os within
2649          * the mean seek distance.  If not, it may be time to break the
2650          * queues apart again.
2651          */
2652         if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2653                 cfq_mark_cfqq_split_coop(cfqq);
2654
2655         /*
2656          * store what was left of this slice, if the queue idled/timed out
2657          */
2658         if (timed_out) {
2659                 if (cfq_cfqq_slice_new(cfqq))
2660                         cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2661                 else
2662                         cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
2663                 cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
2664         }
2665
2666         cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2667
2668         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2669                 cfq_del_cfqq_rr(cfqd, cfqq);
2670
2671         cfq_resort_rr_list(cfqd, cfqq);
2672
2673         if (cfqq == cfqd->active_queue)
2674                 cfqd->active_queue = NULL;
2675
2676         if (cfqd->active_cic) {
2677                 put_io_context(cfqd->active_cic->icq.ioc);
2678                 cfqd->active_cic = NULL;
2679         }
2680 }
2681
2682 static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2683 {
2684         struct cfq_queue *cfqq = cfqd->active_queue;
2685
2686         if (cfqq)
2687                 __cfq_slice_expired(cfqd, cfqq, timed_out);
2688 }
2689
2690 /*
2691  * Get next queue for service. Unless we have a queue preemption,
2692  * we'll simply select the first cfqq in the service tree.
2693  */
2694 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2695 {
2696         struct cfq_rb_root *st = st_for(cfqd->serving_group,
2697                         cfqd->serving_wl_class, cfqd->serving_wl_type);
2698
2699         if (!cfqd->rq_queued)
2700                 return NULL;
2701
2702         /* There is nothing to dispatch */
2703         if (!st)
2704                 return NULL;
2705         if (RB_EMPTY_ROOT(&st->rb.rb_root))
2706                 return NULL;
2707         return cfq_rb_first(st);
2708 }
2709
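/*
 * Forced selection: ignore the currently serving class/type, grab the
 * next group and return the first queue found on any of its service
 * trees.
 */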
2710 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2711 {
2712         struct cfq_group *cfqg;
2713         struct cfq_queue *cfqq;
2714         int i, j;
2715         struct cfq_rb_root *st;
2716
2717         if (!cfqd->rq_queued)
2718                 return NULL;
2719
2720         cfqg = cfq_get_next_cfqg(cfqd);
2721         if (!cfqg)
2722                 return NULL;
2723
2724         for_each_cfqg_st(cfqg, i, j, st) {
2725                 cfqq = cfq_rb_first(st);
2726                 if (cfqq)
2727                         return cfqq;
2728         }
2729         return NULL;
2730 }
2731
2732 /*
2733  * Get and set a new active queue for service.
2734  */
2735 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2736                                               struct cfq_queue *cfqq)
2737 {
2738         if (!cfqq)
2739                 cfqq = cfq_get_next_queue(cfqd);
2740
2741         __cfq_set_active_queue(cfqd, cfqq);
2742         return cfqq;
2743 }
2744
2745 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2746                                           struct request *rq)
2747 {
2748         if (blk_rq_pos(rq) >= cfqd->last_position)
2749                 return blk_rq_pos(rq) - cfqd->last_position;
2750         else
2751                 return cfqd->last_position - blk_rq_pos(rq);
2752 }
2753
2754 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2755                                struct request *rq)
2756 {
2757         return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2758 }
2759
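/*
 * Find another queue in the same priority class whose next request is
 * close (within CFQQ_CLOSE_THR) to cfqd->last_position, using the
 * sector-sorted prio tree.
 */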
2760 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2761                                     struct cfq_queue *cur_cfqq)
2762 {
2763         struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2764         struct rb_node *parent, *node;
2765         struct cfq_queue *__cfqq;
2766         sector_t sector = cfqd->last_position;
2767
2768         if (RB_EMPTY_ROOT(root))
2769                 return NULL;
2770
2771         /*
2772          * First, if we find a request starting at the end of the last
2773          * request, choose it.
2774          */
2775         __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2776         if (__cfqq)
2777                 return __cfqq;
2778
2779         /*
2780          * If the exact sector wasn't found, the parent of the NULL leaf
2781          * will contain the closest sector.
2782          */
2783         __cfqq = rb_entry(parent, struct cfq_queue, p_node);
2784         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2785                 return __cfqq;
2786
2787         if (blk_rq_pos(__cfqq->next_rq) < sector)
2788                 node = rb_next(&__cfqq->p_node);
2789         else
2790                 node = rb_prev(&__cfqq->p_node);
2791         if (!node)
2792                 return NULL;
2793
2794         __cfqq = rb_entry(node, struct cfq_queue, p_node);
2795         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2796                 return __cfqq;
2797
2798         return NULL;
2799 }
2800
2801 /*
2802  * cfqd - obvious
2803  * cur_cfqq - passed in so that we don't decide that the current queue is
2804  *            closely cooperating with itself.
2805  *
2806  * So, basically we're assuming that cur_cfqq has dispatched at least
2807  * one request, and that cfqd->last_position reflects a position on the disk
2808  * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
2809  * assumption.
2810  */
2811 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2812                                               struct cfq_queue *cur_cfqq)
2813 {
2814         struct cfq_queue *cfqq;
2815
2816         if (cfq_class_idle(cur_cfqq))
2817                 return NULL;
2818         if (!cfq_cfqq_sync(cur_cfqq))
2819                 return NULL;
2820         if (CFQQ_SEEKY(cur_cfqq))
2821                 return NULL;
2822
2823         /*
2824          * Don't search priority tree if it's the only queue in the group.
2825          */
2826         if (cur_cfqq->cfqg->nr_cfqq == 1)
2827                 return NULL;
2828
2829         /*
2830          * We should notice if some of the queues are cooperating, e.g.
2831          * working closely on the same area of the disk. In that case,
2832          * we can group them together and not waste time idling.
2833          */
2834         cfqq = cfqq_close(cfqd, cur_cfqq);
2835         if (!cfqq)
2836                 return NULL;
2837
2838         /* If new queue belongs to different cfq_group, don't choose it */
2839         if (cur_cfqq->cfqg != cfqq->cfqg)
2840                 return NULL;
2841
2842         /*
2843          * It only makes sense to merge sync queues.
2844          */
2845         if (!cfq_cfqq_sync(cfqq))
2846                 return NULL;
2847         if (CFQQ_SEEKY(cfqq))
2848                 return NULL;
2849
2850         /*
2851          * Do not merge queues of different priority classes
2852          */
2853         if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2854                 return NULL;
2855
2856         return cfqq;
2857 }
2858
2859 /*
2860  * Determine whether we should enforce idle window for this queue.
2861  */
2862
2863 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2864 {
2865         enum wl_class_t wl_class = cfqq_class(cfqq);
2866         struct cfq_rb_root *st = cfqq->service_tree;
2867
2868         BUG_ON(!st);
2869         BUG_ON(!st->count);
2870
2871         if (!cfqd->cfq_slice_idle)
2872                 return false;
2873
2874         /* We never do for idle class queues. */
2875         if (wl_class == IDLE_WORKLOAD)
2876                 return false;
2877
2878         /* We do for queues that were marked with idle window flag. */
2879         if (cfq_cfqq_idle_window(cfqq) &&
2880            !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2881                 return true;
2882
2883         /*
2884          * Otherwise, we idle only if this is the last queue
2885          * left on its service tree.
2886          */
2887         if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2888            !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2889                 return true;
2890         cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2891         return false;
2892 }
2893
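/*
 * Arm the idle timer for the active queue.  Depending on whether queue
 * idling or only group idling applies, the timer is armed for
 * cfq_slice_idle or cfq_group_idle; the checks below bail out early
 * whenever idling would only waste disk time.
 */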
2894 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2895 {
2896         struct cfq_queue *cfqq = cfqd->active_queue;
2897         struct cfq_rb_root *st = cfqq->service_tree;
2898         struct cfq_io_cq *cic;
2899         u64 sl, group_idle = 0;
2900         u64 now = ktime_get_ns();
2901
2902         /*
2903          * SSD device without seek penalty, disable idling. But only do so
2904          * for devices that support queuing, otherwise we still have a problem
2905          * with sync vs async workloads.
2906          */
2907         if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag &&
2908                 !cfqd->cfq_group_idle)
2909                 return;
2910
2911         WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2912         WARN_ON(cfq_cfqq_slice_new(cfqq));
2913
2914         /*
2915          * idle is disabled, either manually or by past process history
2916          */
2917         if (!cfq_should_idle(cfqd, cfqq)) {
2918                 /* no queue idling. Check for group idling */
2919                 if (cfqd->cfq_group_idle)
2920                         group_idle = cfqd->cfq_group_idle;
2921                 else
2922                         return;
2923         }
2924
2925         /*
2926          * still active requests from this queue, don't idle
2927          */
2928         if (cfqq->dispatched)
2929                 return;
2930
2931         /*
2932          * task has exited, don't wait
2933          */
2934         cic = cfqd->active_cic;
2935         if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2936                 return;
2937
2938         /*
2939          * If our average think time is larger than the remaining time
2940          * slice, then don't idle. This avoids overrunning the allotted
2941          * time slice.
2942          */
2943         if (sample_valid(cic->ttime.ttime_samples) &&
2944             (cfqq->slice_end - now < cic->ttime.ttime_mean)) {
2945                 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu",
2946                              cic->ttime.ttime_mean);
2947                 return;
2948         }
2949
2950         /*
2951          * If there are other queues in the group, or this is the only group
2952          * and its thinktime is too big, don't do group idle.
2953          */
2954         if (group_idle &&
2955             (cfqq->cfqg->nr_cfqq > 1 ||
2956              cfq_io_thinktime_big(cfqd, &st->ttime, true)))
2957                 return;
2958
2959         cfq_mark_cfqq_wait_request(cfqq);
2960
2961         if (group_idle)
2962                 sl = cfqd->cfq_group_idle;
2963         else
2964                 sl = cfqd->cfq_slice_idle;
2965
2966         hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl),
2967                       HRTIMER_MODE_REL);
2968         cfqg_stats_set_start_idle_time(cfqq->cfqg);
2969         cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl,
2970                         group_idle ? 1 : 0);
2971 }
2972
2973 /*
2974  * Move request from internal lists to the request queue dispatch list.
2975  */
2976 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2977 {
2978         struct cfq_data *cfqd = q->elevator->elevator_data;
2979         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2980
2981         cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2982
2983         cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2984         cfq_remove_request(rq);
2985         cfqq->dispatched++;
2986         (RQ_CFQG(rq))->dispatched++;
2987         elv_dispatch_sort(q, rq);
2988
2989         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2990         cfqq->nr_sectors += blk_rq_sectors(rq);
2991 }
2992
2993 /*
2994  * return expired entry, or NULL to just start from scratch in rbtree
2995  */
2996 static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
2997 {
2998         struct request *rq = NULL;
2999
3000         if (cfq_cfqq_fifo_expire(cfqq))
3001                 return NULL;
3002
3003         cfq_mark_cfqq_fifo_expire(cfqq);
3004
3005         if (list_empty(&cfqq->fifo))
3006                 return NULL;
3007
3008         rq = rq_entry_fifo(cfqq->fifo.next);
3009         if (ktime_get_ns() < rq->fifo_time)
3010                 rq = NULL;
3011
3012         return rq;
3013 }
3014
3015 static inline int
3016 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3017 {
3018         const int base_rq = cfqd->cfq_slice_async_rq;
3019
3020         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
3021
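        /*
         * Scale the async dispatch budget by priority: lower ioprio numbers
         * (higher priority) get proportionally more requests. For example,
         * with base_rq == 2 and the usual IOPRIO_BE_NR of 8, ioprio 0 allows
         * 2 * 2 * 8 = 32 requests while ioprio 7 allows 2 * 2 * 1 = 4.
         */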
3022         return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
3023 }
3024
3025 /*
3026  * Must be called with the queue_lock held.
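 * Returns the queue's process references: the total refcount minus one
 * reference per currently allocated request.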
3027  */
3028 static int cfqq_process_refs(struct cfq_queue *cfqq)
3029 {
3030         int process_refs, io_refs;
3031
3032         io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
3033         process_refs = cfqq->ref - io_refs;
3034         BUG_ON(process_refs < 0);
3035         return process_refs;
3036 }
3037
3038 static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
3039 {
3040         int process_refs, new_process_refs;
3041         struct cfq_queue *__cfqq;
3042
3043         /*
3044          * If there are no process references on the new_cfqq, then it is
3045          * unsafe to follow the ->new_cfqq chain as other cfqq's in the
3046          * chain may have dropped their last reference (not just their
3047          * last process reference).
3048          */
3049         if (!cfqq_process_refs(new_cfqq))
3050                 return;
3051
3052         /* Avoid a circular list and skip interim queue merges */
3053         while ((__cfqq = new_cfqq->new_cfqq)) {
3054                 if (__cfqq == cfqq)
3055                         return;
3056                 new_cfqq = __cfqq;
3057         }
3058
3059         process_refs = cfqq_process_refs(cfqq);
3060         new_process_refs = cfqq_process_refs(new_cfqq);
3061         /*
3062          * If the process for the cfqq has gone away, there is no
3063          * sense in merging the queues.
3064          */
3065         if (process_refs == 0 || new_process_refs == 0)
3066                 return;
3067
3068         /*
3069          * Merge in the direction of the lesser amount of work.
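         * The queue with fewer process refs is pointed at the busier queue,
         * which inherits those references.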
3070          */
3071         if (new_process_refs >= process_refs) {
3072                 cfqq->new_cfqq = new_cfqq;
3073                 new_cfqq->ref += process_refs;
3074         } else {
3075                 new_cfqq->new_cfqq = cfqq;
3076                 cfqq->ref += new_process_refs;
3077         }
3078 }
3079
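/*
 * Pick the workload type within @wl_class whose first queued cfqq has the
 * smallest rb_key, i.e. the earliest position on its service tree; fall back
 * to SYNC_NOIDLE if nothing is queued.
 */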
3080 static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
3081                         struct cfq_group *cfqg, enum wl_class_t wl_class)
3082 {
3083         struct cfq_queue *queue;
3084         int i;
3085         bool key_valid = false;
3086         u64 lowest_key = 0;
3087         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
3088
3089         for (i = 0; i <= SYNC_WORKLOAD; ++i) {
3090                 /* select the one with lowest rb_key */
3091                 queue = cfq_rb_first(st_for(cfqg, wl_class, i));
3092                 if (queue &&
3093                     (!key_valid || queue->rb_key < lowest_key)) {
3094                         lowest_key = queue->rb_key;
3095                         cur_best = i;
3096                         key_valid = true;
3097                 }
3098         }
3099
3100         return cur_best;
3101 }
3102
3103 static void
3104 choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
3105 {
3106         u64 slice;
3107         unsigned count;
3108         struct cfq_rb_root *st;
3109         u64 group_slice;
3110         enum wl_class_t original_class = cfqd->serving_wl_class;
3111         u64 now = ktime_get_ns();
3112
3113         /* Choose next priority. RT > BE > IDLE */
3114         if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
3115                 cfqd->serving_wl_class = RT_WORKLOAD;
3116         else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
3117                 cfqd->serving_wl_class = BE_WORKLOAD;
3118         else {
3119                 cfqd->serving_wl_class = IDLE_WORKLOAD;
3120                 cfqd->workload_expires = now + jiffies_to_nsecs(1);
3121                 return;
3122         }
3123
3124         if (original_class != cfqd->serving_wl_class)
3125                 goto new_workload;
3126
3127         /*
3128          * For RT and BE, we also have to choose the type
3129          * (SYNC, SYNC_NOIDLE, ASYNC) and compute a workload
3130          * expiration time
3131          */
3132         st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3133         count = st->count;
3134
3135         /*
3136          * check workload expiration, and that we still have other queues ready
3137          */
3138         if (count && !(now > cfqd->workload_expires))
3139                 return;
3140
3141 new_workload:
3142         /* otherwise select new workload type */
3143         cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
3144                                         cfqd->serving_wl_class);
3145         st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3146         count = st->count;
3147
3148         /*
3149          * the workload slice is computed as a fraction of target latency
3150          * proportional to the number of queues in that workload, over
3151          * all the queues in the same priority class
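         * (e.g. with a 300ms group slice, 2 queues in this workload and 6
         * busy queues in the class, the workload slice is 300 * 2 / 6 = 100ms)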
3152          */
3153         group_slice = cfq_group_slice(cfqd, cfqg);
3154
3155         slice = div_u64(group_slice * count,
3156                 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
3157                       cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
3158                                         cfqg)));
3159
3160         if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
3161                 u64 tmp;
3162
3163                 /*
3164                  * Async queues are currently system wide. Taking only the
3165                  * proportion of queues within the same group would lead to a higher
3166                  * async ratio system wide, as the root group generally has a
3167                  * higher weight. A more accurate approach would be to
3168                  * calculate a system wide async/sync ratio.
3169                  */
3170                 tmp = cfqd->cfq_target_latency *
3171                         cfqg_busy_async_queues(cfqd, cfqg);
3172                 tmp = div_u64(tmp, cfqd->busy_queues);
3173                 slice = min_t(u64, slice, tmp);
3174
3175                 /* async workload slice is scaled down according to
3176                  * the sync/async slice ratio. */
3177                 slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]);
3178         } else
3179                 /* sync workload slice is at least 2 * cfq_slice_idle */
3180                 slice = max(slice, 2 * cfqd->cfq_slice_idle);
3181
3182         slice = max_t(u64, slice, CFQ_MIN_TT);
3183         cfq_log(cfqd, "workload slice:%llu", slice);
3184         cfqd->workload_expires = now + slice;
3185 }
3186
3187 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
3188 {
3189         struct cfq_rb_root *st = &cfqd->grp_service_tree;
3190         struct cfq_group *cfqg;
3191
3192         if (RB_EMPTY_ROOT(&st->rb.rb_root))
3193                 return NULL;
3194         cfqg = cfq_rb_first_group(st);
3195         update_min_vdisktime(st);
3196         return cfqg;
3197 }
3198
3199 static void cfq_choose_cfqg(struct cfq_data *cfqd)
3200 {
3201         struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
3202         u64 now = ktime_get_ns();
3203
3204         cfqd->serving_group = cfqg;
3205
3206         /* Restore the workload type data */
3207         if (cfqg->saved_wl_slice) {
3208                 cfqd->workload_expires = now + cfqg->saved_wl_slice;
3209                 cfqd->serving_wl_type = cfqg->saved_wl_type;
3210                 cfqd->serving_wl_class = cfqg->saved_wl_class;
3211         } else
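                /* already in the past: forces choose_wl_class_and_type()
                 * below to pick a fresh workload */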
3212                 cfqd->workload_expires = now - 1;
3213
3214         choose_wl_class_and_type(cfqd, cfqg);
3215 }
3216
3217 /*
3218  * Select a queue for service. If we have a current active queue,
3219  * check whether to continue servicing it, or retrieve and set a new one.
3220  */
3221 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
3222 {
3223         struct cfq_queue *cfqq, *new_cfqq = NULL;
3224         u64 now = ktime_get_ns();
3225
3226         cfqq = cfqd->active_queue;
3227         if (!cfqq)
3228                 goto new_queue;
3229
3230         if (!cfqd->rq_queued)
3231                 return NULL;
3232
3233         /*
3234          * We were waiting for group to get backlogged. Expire the queue
3235          */
3236         if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
3237                 goto expire;
3238
3239         /*
3240          * The active queue has run out of time, expire it and select new.
3241          */
3242         if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
3243                 /*
3244                  * If slice had not expired at the completion of last request
3245                  * we might not have turned on wait_busy flag. Don't expire
3246                  * the queue yet. Allow the group to get backlogged.
3247                  *
3248                  * The very fact that we have used the slice, that means we
3249                  * have been idling all along on this queue and it should be
3250                  * ok to wait for this request to complete.
3251                  */
3252                 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
3253                     && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3254                         cfqq = NULL;
3255                         goto keep_queue;
3256                 } else
3257                         goto check_group_idle;
3258         }
3259
3260         /*
3261          * The active queue has requests and isn't expired, allow it to
3262          * dispatch.
3263          */
3264         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3265                 goto keep_queue;
3266
3267         /*
3268          * If another queue has a request waiting within our mean seek
3269          * distance, let it run.  The expire code will check for close
3270          * cooperators and put the close queue at the front of the service
3271          * tree.  If possible, merge the expiring queue with the new cfqq.
3272          */
3273         new_cfqq = cfq_close_cooperator(cfqd, cfqq);
3274         if (new_cfqq) {
3275                 if (!cfqq->new_cfqq)
3276                         cfq_setup_merge(cfqq, new_cfqq);
3277                 goto expire;
3278         }
3279
3280         /*
3281          * No requests pending. If the active queue still has requests in
3282          * flight or is idling for a new request, allow either of these
3283          * conditions to happen (or time out) before selecting a new queue.
3284          */
3285         if (hrtimer_active(&cfqd->idle_slice_timer)) {
3286                 cfqq = NULL;
3287                 goto keep_queue;
3288         }
3289
3290         /*
3291          * This is a deep seek queue, but the device is much faster than
3292          * the queue can deliver, don't idle
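         * (only while the slice is new or less than half of it has elapsed)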
3293          */
3294         if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
3295             (cfq_cfqq_slice_new(cfqq) ||
3296             (cfqq->slice_end - now > now - cfqq->slice_start))) {
3297                 cfq_clear_cfqq_deep(cfqq);
3298                 cfq_clear_cfqq_idle_window(cfqq);
3299         }
3300
3301         if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3302                 cfqq = NULL;
3303                 goto keep_queue;
3304         }
3305
3306         /*
3307          * If group idle is enabled and there are requests dispatched from
3308          * this group, wait for requests to complete.
3309          */
3310 check_group_idle:
3311         if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
3312             cfqq->cfqg->dispatched &&
3313             !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
3314                 cfqq = NULL;
3315                 goto keep_queue;
3316         }
3317
3318 expire:
3319         cfq_slice_expired(cfqd, 0);
3320 new_queue:
3321         /*
3322          * Current queue expired. Check if we have to switch to a new
3323          * service tree
3324          */
3325         if (!new_cfqq)
3326                 cfq_choose_cfqg(cfqd);
3327
3328         cfqq = cfq_set_active_queue(cfqd, new_cfqq);
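/* cfqq may be NULL here, telling the caller to dispatch nothing for now */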
3329 keep_queue:
3330         return cfqq;
3331 }
3332
3333 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
3334 {
3335         int dispatched = 0;
3336
3337         while (cfqq->next_rq) {
3338                 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
3339                 dispatched++;
3340         }
3341
3342         BUG_ON(!list_empty(&cfqq->fifo));
3343
3344         /* By default cfqq is not expired if it is empty. Do it explicitly */
3345         __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
3346         return dispatched;
3347 }
3348
3349 /*
3350  * Drain our current requests. Used for barriers and when switching
3351  * io schedulers on-the-fly.
3352  */
3353 static int cfq_forced_dispatch(struct cfq_data *cfqd)
3354 {
3355         struct cfq_queue *cfqq;
3356         int dispatched = 0;
3357
3358         /* Expire the timeslice of the current active queue first */
3359         cfq_slice_expired(cfqd, 0);
3360         while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
3361                 __cfq_set_active_queue(cfqd, cfqq);
3362                 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3363         }
3364
3365         BUG_ON(cfqd->busy_queues);
3366
3367         cfq_log(cfqd, "forced_dispatch=%d", dispatched);
3368         return dispatched;
3369 }
3370
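/*
 * Estimate whether this queue's slice will run out before its in-flight
 * requests complete, budgeting roughly one idle period per dispatched
 * request.
 */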
3371 static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3372         struct cfq_queue *cfqq)
3373 {
3374         u64 now = ktime_get_ns();
3375
3376         /* the queue hasn't finished any request, can't estimate */
3377         if (cfq_cfqq_slice_new(cfqq))
3378                 return true;
3379         if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end)
3380                 return true;
3381
3382         return false;
3383 }
3384
3385 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3386 {
3387         unsigned int max_dispatch;
3388
3389         if (cfq_cfqq_must_dispatch(cfqq))
3390                 return true;
3391
3392         /*
3393          * Drain async requests before we start sync IO
3394          */
3395         if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3396                 return false;
3397
3398         /*
3399          * If this is an async queue and we have sync IO in flight, let it wait
3400          */
3401         if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3402                 return false;
3403
3404         max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3405         if (cfq_class_idle(cfqq))
3406                 max_dispatch = 1;
3407
3408         /*
3409          * Does this cfqq already have too much IO in flight?
3410          */
3411         if (cfqq->dispatched >= max_dispatch) {
3412                 bool promote_sync = false;
3413                 /*
3414                  * an idle queue must never have more than a single IO in flight
3415                  */
3416                 if (cfq_class_idle(cfqq))
3417                         return false;
3418
3419                 /*
3420                  * If there is only one sync queue,
3421                  * we can ignore the async queues and give the sync
3422                  * queue no dispatch limit. The reason: a sync queue can
3423                  * preempt an async queue, so limiting the sync queue makes
3424                  * no sense. This is useful for the aiostress test.
3425                  */
3426                 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3427                         promote_sync = true;
3428
3429                 /*
3430                  * We have other queues, don't allow more IO from this one
3431                  */
3432                 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3433                                 !promote_sync)
3434                         return false;
3435
3436                 /*
3437                  * Sole queue user, no limit
3438                  */
3439                 if (cfqd->busy_queues == 1 || promote_sync)
3440                         max_dispatch = -1;
3441                 else
3442                         /*
3443                          * Normally we start throttling cfqq when cfq_quantum/2
3444                          * requests have been dispatched. But we can drive
3445                          * deeper queue depths at the beginning of the slice,
3446                          * subject to an upper limit of cfq_quantum.
3447                          */
3448                         max_dispatch = cfqd->cfq_quantum;
3449         }
3450
3451         /*
3452          * Async queues must wait a bit before being allowed dispatch.
3453          * We also ramp up the dispatch depth gradually for async IO,
3454          * based on the last sync IO we serviced
3455          */
3456         if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
3457                 u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync;
3458                 unsigned int depth;
3459
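                /*
                 * Allow roughly one more in-flight async request for every
                 * cfq_slice[1] (sync slice) interval that has passed since
                 * the last delayed sync IO.
                 */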
3460                 depth = div64_u64(last_sync, cfqd->cfq_slice[1]);
3461                 if (!depth && !cfqq->dispatched)
3462                         depth = 1;
3463                 if (depth < max_dispatch)
3464                         max_dispatch = depth;
3465         }
3466
3467         /*
3468          * If we're below the current max, allow a dispatch
3469          */
3470         return cfqq->dispatched < max_dispatch;
3471 }
3472
3473 /*
3474  * Dispatch a request from cfqq, moving them to the request queue
3475  * dispatch list.
3476  */
3477 static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3478 {
3479         struct request *rq;
3480
3481         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3482
3483         rq = cfq_check_fifo(cfqq);
3484         if (rq)
3485                 cfq_mark_cfqq_must_dispatch(cfqq);
3486
3487         if (!cfq_may_dispatch(cfqd, cfqq))
3488                 return false;
3489
3490         /*
3491          * follow expired path, else get first next available
3492          */
3493         if (!rq)
3494                 rq = cfqq->next_rq;
3495         else
3496                 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
3497
3498         /*
3499          * insert request into driver dispatch list
3500          */
3501         cfq_dispatch_insert(cfqd->queue, rq);
3502
3503         if (!cfqd->active_cic) {
3504                 struct cfq_io_cq *cic = RQ_CIC(rq);
3505
3506                 atomic_long_inc(&cic->icq.ioc->refcount);
3507                 cfqd->active_cic = cic;
3508         }
3509
3510         return true;
3511 }
3512
3513 /*
3514  * Find the cfqq that we need to service and move a request from that to the
3515  * dispatch list
3516  */
3517 static int cfq_dispatch_requests(struct request_queue *q, int force)
3518 {
3519         struct cfq_data *cfqd = q->elevator->elevator_data;
3520         struct cfq_queue *cfqq;
3521
3522         if (!cfqd->busy_queues)
3523                 return 0;
3524
3525         if (unlikely(force))
3526                 return cfq_forced_dispatch(cfqd);
3527
3528         cfqq = cfq_select_queue(cfqd);
3529         if (!cfqq)
3530                 return 0;
3531
3532         /*
3533          * Dispatch a request from this cfqq, if it is allowed
3534          */
3535         if (!cfq_dispatch_request(cfqd, cfqq))
3536                 return 0;
3537
3538         cfqq->slice_dispatch++;
3539         cfq_clear_cfqq_must_dispatch(cfqq);
3540
3541         /*
3542          * expire an async queue immediately if it has used up its slice. Idle
3543          * queues always expire after one dispatch round.
3544          */
3545         if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3546             cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3547             cfq_class_idle(cfqq))) {
3548                 cfqq->slice_end = ktime_get_ns() + 1;
3549                 cfq_slice_expired(cfqd, 0);
3550         }
3551
3552         cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
3553         return 1;
3554 }
3555
3556 /*
3557  * A task holds one reference to the queue, dropped when the task exits. Each rq
3558  * in flight on this queue also holds a reference, dropped when the rq is freed.
3559  *
3560  * Each cfq queue took a reference on the parent group. Drop it now.
3561  * queue lock must be held here.
3562  */
3563 static void cfq_put_queue(struct cfq_queue *cfqq)
3564 {
3565         struct cfq_data *cfqd = cfqq->cfqd;
3566         struct cfq_group *cfqg;
3567
3568         BUG_ON(cfqq->ref <= 0);
3569
3570         cfqq->ref--;
3571         if (cfqq->ref)
3572                 return;
3573
3574         cfq_log_cfqq(cfqd, cfqq, "put_queue");
3575         BUG_ON(rb_first(&cfqq->sort_list));
3576         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
3577         cfqg = cfqq->cfqg;
3578
3579         if (unlikely(cfqd->active_queue == cfqq)) {
3580                 __cfq_slice_expired(cfqd, cfqq, 0);
3581                 cfq_schedule_dispatch(cfqd);
3582         }
3583
3584         BUG_ON(cfq_cfqq_on_rr(cfqq));
3585         kmem_cache_free(cfq_pool, cfqq);
3586         cfqg_put(cfqg);
3587 }
3588
3589 static void cfq_put_cooperator(struct cfq_queue *cfqq)
3590 {
3591         struct cfq_queue *__cfqq, *next;
3592
3593         /*
3594          * If this queue was scheduled to merge with another queue, be
3595          * sure to drop the reference taken on that queue (and others in
3596          * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
3597          */
3598         __cfqq = cfqq->new_cfqq;
3599         while (__cfqq) {
3600                 if (__cfqq == cfqq) {
3601                         WARN(1, "cfqq->new_cfqq loop detected\n");
3602                         break;
3603                 }
3604                 next = __cfqq->new_cfqq;
3605                 cfq_put_queue(__cfqq);
3606                 __cfqq = next;
3607         }
3608 }
3609
3610 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3611 {
3612         if (unlikely(cfqq == cfqd->active_queue)) {
3613                 __cfq_slice_expired(cfqd, cfqq, 0);
3614                 cfq_schedule_dispatch(cfqd);
3615         }
3616
3617         cfq_put_cooperator(cfqq);
3618
3619         cfq_put_queue(cfqq);
3620 }
3621
3622 static void cfq_init_icq(struct io_cq *icq)
3623 {
3624         struct cfq_io_cq *cic = icq_to_cic(icq);
3625
3626         cic->ttime.last_end_request = ktime_get_ns();
3627 }
3628
3629 static void cfq_exit_icq(struct io_cq *icq)
3630 {
3631         struct cfq_io_cq *cic = icq_to_cic(icq);
3632         struct cfq_data *cfqd = cic_to_cfqd(cic);
3633
3634         if (cic_to_cfqq(cic, false)) {
3635                 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
3636                 cic_set_cfqq(cic, NULL, false);
3637         }
3638
3639         if (cic_to_cfqq(cic, true)) {
3640                 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
3641                 cic_set_cfqq(cic, NULL, true);
3642         }
3643 }
3644
3645 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3646 {
3647         struct task_struct *tsk = current;
3648         int ioprio_class;
3649
3650         if (!cfq_cfqq_prio_changed(cfqq))
3651                 return;
3652
3653         ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3654         switch (ioprio_class) {
3655         default:
3656                 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
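                /* fall through */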
3657         case IOPRIO_CLASS_NONE:
3658                 /*
3659                  * no prio set, inherit CPU scheduling settings
3660                  */
3661                 cfqq->ioprio = task_nice_ioprio(tsk);
3662                 cfqq->ioprio_class = task_nice_ioclass(tsk);
3663                 break;
3664         case IOPRIO_CLASS_RT:
3665                 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3666                 cfqq->ioprio_class = IOPRIO_CLASS_RT;
3667                 break;
3668         case IOPRIO_CLASS_BE:
3669                 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3670                 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3671                 break;
3672         case IOPRIO_CLASS_IDLE:
3673                 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3674                 cfqq->ioprio = 7;
3675                 cfq_clear_cfqq_idle_window(cfqq);
3676                 break;
3677         }
3678
3679         /*
3680          * keep track of original prio settings in case we have to temporarily
3681          * elevate the priority of this queue
3682          */
3683         cfqq->org_ioprio = cfqq->ioprio;
3684         cfqq->org_ioprio_class = cfqq->ioprio_class;
3685         cfq_clear_cfqq_prio_changed(cfqq);
3686 }
3687
3688 static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3689 {
3690         int ioprio = cic->icq.ioc->ioprio;
3691         struct cfq_data *cfqd = cic_to_cfqd(cic);
3692         struct cfq_queue *cfqq;
3693
3694         /*
3695          * Check whether ioprio has changed.  The condition may trigger
3696          * spuriously on a newly created cic but there's no harm.
3697          */
3698         if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3699                 return;
3700
3701         cfqq = cic_to_cfqq(cic, false);
3702         if (cfqq) {
3703                 cfq_put_queue(cfqq);
3704                 cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
3705                 cic_set_cfqq(cic, cfqq, false);
3706         }
3707
3708         cfqq = cic_to_cfqq(cic, true);
3709         if (cfqq)
3710                 cfq_mark_cfqq_prio_changed(cfqq);
3711
3712         cic->ioprio = ioprio;
3713 }
3714
3715 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3716                           pid_t pid, bool is_sync)
3717 {
3718         RB_CLEAR_NODE(&cfqq->rb_node);
3719         RB_CLEAR_NODE(&cfqq->p_node);
3720         INIT_LIST_HEAD(&cfqq->fifo);
3721
3722         cfqq->ref = 0;
3723         cfqq->cfqd = cfqd;
3724
3725         cfq_mark_cfqq_prio_changed(cfqq);
3726
3727         if (is_sync) {
3728                 if (!cfq_class_idle(cfqq))
3729                         cfq_mark_cfqq_idle_window(cfqq);
3730                 cfq_mark_cfqq_sync(cfqq);
3731         }
3732         cfqq->pid = pid;
3733 }
3734
3735 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3736 static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3737 {
3738         struct cfq_data *cfqd = cic_to_cfqd(cic);
3739         struct cfq_queue *cfqq;
3740         uint64_t serial_nr;
3741
3742         rcu_read_lock();
3743         serial_nr = bio_blkcg(bio)->css.serial_nr;
3744         rcu_read_unlock();
3745
3746         /*
3747          * Check whether blkcg has changed.  The condition may trigger
3748          * spuriously on a newly created cic but there's no harm.
3749          */
3750         if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
3751                 return;
3752
3753         /*
3754          * Drop reference to queues.  New queues will be assigned in new
3755          * group upon arrival of fresh requests.
3756          */
3757         cfqq = cic_to_cfqq(cic, false);
3758         if (cfqq) {
3759                 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3760                 cic_set_cfqq(cic, NULL, false);
3761                 cfq_put_queue(cfqq);
3762         }
3763
3764         cfqq = cic_to_cfqq(cic, true);
3765         if (cfqq) {
3766                 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3767                 cic_set_cfqq(cic, NULL, true);
3768                 cfq_put_queue(cfqq);
3769         }
3770
3771         cic->blkcg_serial_nr = serial_nr;
3772 }
3773 #else
3774 static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3775 {
3776 }
3777 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
3778
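/*
 * Map (ioprio_class, ioprio) to the slot that caches this group's shared
 * async queue: one queue per priority level for RT and BE, a single queue
 * for IDLE, and CLASS_NONE is treated as BE at the normal priority.
 */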
3779 static struct cfq_queue **
3780 cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
3781 {
3782         switch (ioprio_class) {
3783         case IOPRIO_CLASS_RT:
3784                 return &cfqg->async_cfqq[0][ioprio];
3785         case IOPRIO_CLASS_NONE:
3786                 ioprio = IOPRIO_NORM;
3787                 /* fall through */
3788         case IOPRIO_CLASS_BE:
3789                 return &cfqg->async_cfqq[1][ioprio];
3790         case IOPRIO_CLASS_IDLE:
3791                 return &cfqg->async_idle_cfqq;
3792         default:
3793                 BUG();
3794         }
3795 }
3796
3797 static struct cfq_queue *
3798 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3799               struct bio *bio)
3800 {
3801         int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3802         int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3803         struct cfq_queue **async_cfqq = NULL;
3804         struct cfq_queue *cfqq;
3805         struct cfq_group *cfqg;
3806
3807         rcu_read_lock();
3808         cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
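        /*
         * No usable group (or, below, a failed allocation): fall back to the
         * preallocated oom_cfqq embedded in cfqd so we can still return a
         * queue.
         */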
3809         if (!cfqg) {
3810                 cfqq = &cfqd->oom_cfqq;
3811                 goto out;
3812         }
3813
3814         if (!is_sync) {
3815                 if (!ioprio_valid(cic->ioprio)) {
3816                         struct task_struct *tsk = current;
3817                         ioprio = task_nice_ioprio(tsk);
3818                         ioprio_class = task_nice_ioclass(tsk);
3819                 }
3820                 async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
3821                 cfqq = *async_cfqq;
3822                 if (cfqq)
3823                         goto out;
3824         }
3825
3826         cfqq = kmem_cache_alloc_node(cfq_pool,
3827                                      GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
3828                                      cfqd->queue->node);
3829         if (!cfqq) {
3830                 cfqq = &cfqd->oom_cfqq;
3831                 goto out;
3832         }
3833
3834         /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
3835         cfqq->ioprio_class = IOPRIO_CLASS_NONE;
3836         cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3837         cfq_init_prio_data(cfqq, cic);
3838         cfq_link_cfqq_cfqg(cfqq, cfqg);
3839         cfq_log_cfqq(cfqd, cfqq, "alloced");
3840
3841         if (async_cfqq) {
3842                 /* a new async queue was created; pin it and remember it */
3843                 cfqq->ref++;
3844                 *async_cfqq = cfqq;
3845         }
3846 out:
3847         cfqq->ref++;
3848         rcu_read_unlock();
3849         return cfqq;
3850 }
3851
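/*
 * Fold a new think-time sample into an exponentially decaying average with
 * weight 7/8 for history and 1/8 for the new sample. Samples are clamped to
 * twice the idle slice, the 256 scaling keeps fixed-point precision, and the
 * +128 provides rounding of the mean.
 */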
3852 static void
3853 __cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle)
3854 {
3855         u64 elapsed = ktime_get_ns() - ttime->last_end_request;
3856         elapsed = min(elapsed, 2UL * slice_idle);
3857
3858         ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3859         ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed,  8);
3860         ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
3861                                      ttime->ttime_samples);
3862 }
3863
3864 static void
3865 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3866                         struct cfq_io_cq *cic)
3867 {
3868         if (cfq_cfqq_sync(cfqq)) {
3869                 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3870                 __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3871                         cfqd->cfq_slice_idle);
3872         }
3873 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3874         __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3875 #endif
3876 }
3877
3878 static void
3879 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3880                        struct request *rq)
3881 {
3882         sector_t sdist = 0;
3883         sector_t n_sec = blk_rq_sectors(rq);
3884         if (cfqq->last_request_pos) {
3885                 if (cfqq->last_request_pos < blk_rq_pos(rq))
3886                         sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3887                 else
3888                         sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3889         }
3890
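        /*
         * seek_history is a shift register of recent requests: shift in a 1
         * when the request looks seeky (a small request on non-rotational
         * storage, or a large seek distance otherwise) so CFQQ_SEEKY() can
         * judge the recent access pattern.
         */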
3891         cfqq->seek_history <<= 1;
3892         if (blk_queue_nonrot(cfqd->queue))
3893                 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3894         else
3895                 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3896 }
3897
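/* a write marked REQ_SYNC but not REQ_IDLE: a hint not to idle for more IO */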
3898 static inline bool req_noidle(struct request *req)
3899 {
3900         return req_op(req) == REQ_OP_WRITE &&
3901                 (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC;
3902 }
3903
3904 /*
3905  * Disable idle window if the process thinks too long or seeks so much that
3906  * it doesn't matter
3907  */
3908 static void
3909 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3910                        struct cfq_io_cq *cic)
3911 {
3912         int old_idle, enable_idle;
3913
3914         /*
3915          * Don't idle for async or idle io prio class
3916          */
3917         if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3918                 return;
3919
3920         enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3921
3922         if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3923                 cfq_mark_cfqq_deep(cfqq);
3924
3925         if (cfqq->next_rq && req_noidle(cfqq->next_rq))
3926                 enable_idle = 0;
3927         else if (!atomic_read(&cic->icq.ioc->active_ref) ||