1 // SPDX-License-Identifier: GPL-2.0
3 * Historical Service Time
5 * Keeps a time-weighted exponential moving average of the historical
6 * service time. Estimates future service time based on the historical
7 * service time and the number of outstanding requests.
9 * Marks paths stale if they have not finished within hst *
10 * num_paths. If a path is stale and unused, we will send a single
11 * request to probe in case the path has improved. This situation
12 * generally arises if the path is so much worse than others that it
13 * will never have the best estimated service time, or if the entire
14 * multipath device is unused. If a path is stale and in use, limit the
15 * number of requests it can receive with the assumption that the path
16 * has become degraded.
18 * To avoid repeatedly calculating exponents for time weighting, times
19 * are split into HST_WEIGHT_COUNT buckets each (1 << HST_BUCKET_SHIFT)
20 * ns, and the weighting is pre-calculated.
25 #include "dm-path-selector.h"
27 #include <linux/blkdev.h>
28 #include <linux/slab.h>
29 #include <linux/module.h>
30 #include <linux/sched/clock.h>
#define DM_MSG_PREFIX	"multipath historical-service-time"
#define HST_MIN_IO 1	/* default repeat_count: switch path after every I/O */
#define HST_VERSION "0.1.1"

#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
#define HST_FIXED_95 972	/* ~0.95 in 10-bit fixed point: default EMA base */
#define HST_MAX_INFLIGHT HST_FIXED_1
#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
#define HST_WEIGHT_COUNT 64ULL
46 struct list_head valid_paths;
47 struct list_head failed_paths;
51 unsigned int weights[HST_WEIGHT_COUNT];
52 unsigned int threshold_multiplier;
56 struct list_head list;
58 unsigned int repeat_count;
62 u64 historical_service_time; /* Fixed point */
71 * fixed_power - compute: x^n, in O(log n) time
73 * @x: base of the power
74 * @frac_bits: fractional bits of @x
75 * @n: power to raise @x to.
77 * By exploiting the relation between the definition of the natural power
78 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
79 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
80 * (where: n_i \elem {0, 1}, the binary vector representing n),
81 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
82 * of course trivially computable in O(log_2 n), the length of our binary
85 * (see: kernel/sched/loadavg.c)
87 static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
89 unsigned long result = 1UL << frac_bits;
95 result += 1UL << (frac_bits - 1);
102 x += 1UL << (frac_bits - 1);
111 * Calculate the next value of an exponential moving average
112 * a_1 = a_0 * e + a * (1 - e)
114 * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
115 * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
116 * @weight: [0, HST_FIXED_1]
119 * To account for multiple periods in the same calculation,
120 * a_n = a_0 * e^n + a * (1 - e^n),
121 * so call fixed_ema(last, next, pow(weight, N))
123 static u64 fixed_ema(u64 last, u64 next, u64 weight)
126 last += next * (HST_FIXED_1 - weight);
127 last += 1ULL << (HST_FIXED_SHIFT - 1);
128 return last >> HST_FIXED_SHIFT;
131 static struct selector *alloc_selector(void)
133 struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
136 INIT_LIST_HEAD(&s->valid_paths);
137 INIT_LIST_HEAD(&s->failed_paths);
138 spin_lock_init(&s->lock);
146 * Get the weight for a given time span.
148 static u64 hst_weight(struct path_selector *ps, u64 delta)
150 struct selector *s = ps->context;
151 int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
152 HST_WEIGHT_COUNT - 1);
154 return s->weights[bucket];
158 * Set up the weights array.
161 * weights[n] = base ^ (n + 1)
163 static void hst_set_weights(struct path_selector *ps, unsigned int base)
165 struct selector *s = ps->context;
168 if (base >= HST_FIXED_1)
171 for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
172 s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
173 s->weights[HST_WEIGHT_COUNT - 1] = 0;
176 static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
179 unsigned int base_weight = HST_FIXED_95;
180 unsigned int threshold_multiplier = 0;
184 * Arguments: [<base_weight> [<threshold_multiplier>]]
185 * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
186 * value of 0 will completely ignore any history.
187 * If not given, default (HST_FIXED_95) is used.
188 * <threshold_multiplier>: Minimum threshold multiplier for paths to
189 * be considered different. That is, a path is
190 * considered different iff (p1 > N * p2) where p1
191 * is the path with higher service time. A threshold
192 * of 1 or 0 has no effect. Defaults to 0.
197 if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
198 base_weight >= HST_FIXED_1)) {
202 if (argc > 1 && (sscanf(argv[1], "%u%c",
203 &threshold_multiplier, &dummy) != 1)) {
207 s = alloc_selector();
213 hst_set_weights(ps, base_weight);
214 s->threshold_multiplier = threshold_multiplier;
218 static void free_paths(struct list_head *paths)
220 struct path_info *pi, *next;
222 list_for_each_entry_safe(pi, next, paths, list) {
228 static void hst_destroy(struct path_selector *ps)
230 struct selector *s = ps->context;
232 free_paths(&s->valid_paths);
233 free_paths(&s->failed_paths);
238 static int hst_status(struct path_selector *ps, struct dm_path *path,
239 status_type_t type, char *result, unsigned int maxlen)
242 struct path_info *pi;
245 struct selector *s = ps->context;
247 DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
249 pi = path->pscontext;
252 case STATUSTYPE_INFO:
253 DMEMIT("%llu %llu %llu ", pi->historical_service_time,
254 pi->outstanding, pi->stale_after);
256 case STATUSTYPE_TABLE:
268 static int hst_add_path(struct path_selector *ps, struct dm_path *path,
269 int argc, char **argv, char **error)
271 struct selector *s = ps->context;
272 struct path_info *pi;
273 unsigned int repeat_count = HST_MIN_IO;
278 * Arguments: [<repeat_count>]
279 * <repeat_count>: The number of I/Os before switching path.
280 * If not given, default (HST_MIN_IO) is used.
283 *error = "historical-service-time ps: incorrect number of arguments";
287 if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
288 *error = "historical-service-time ps: invalid repeat count";
292 /* allocate the path */
293 pi = kmalloc(sizeof(*pi), GFP_KERNEL);
295 *error = "historical-service-time ps: Error allocating path context";
300 pi->repeat_count = repeat_count;
302 pi->historical_service_time = HST_FIXED_1;
304 spin_lock_init(&pi->lock);
310 path->pscontext = pi;
312 spin_lock_irqsave(&s->lock, flags);
313 list_add_tail(&pi->list, &s->valid_paths);
315 spin_unlock_irqrestore(&s->lock, flags);
320 static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
322 struct selector *s = ps->context;
323 struct path_info *pi = path->pscontext;
326 spin_lock_irqsave(&s->lock, flags);
327 list_move(&pi->list, &s->failed_paths);
329 spin_unlock_irqrestore(&s->lock, flags);
332 static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
334 struct selector *s = ps->context;
335 struct path_info *pi = path->pscontext;
338 spin_lock_irqsave(&s->lock, flags);
339 list_move_tail(&pi->list, &s->valid_paths);
341 spin_unlock_irqrestore(&s->lock, flags);
346 static void hst_fill_compare(struct path_info *pi, u64 *hst,
347 u64 *out, u64 *stale)
351 spin_lock_irqsave(&pi->lock, flags);
352 *hst = pi->historical_service_time;
353 *out = pi->outstanding;
354 *stale = pi->stale_after;
355 spin_unlock_irqrestore(&pi->lock, flags);
359 * Compare the estimated service time of 2 paths, pi1 and pi2,
360 * for the incoming I/O.
363 * < 0 : pi1 is better
364 * 0 : no difference between pi1 and pi2
365 * > 0 : pi2 is better
368 static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
369 u64 time_now, struct path_selector *ps)
371 struct selector *s = ps->context;
373 long long out1, out2, stale1, stale2;
374 int pi2_better, over_threshold;
376 hst_fill_compare(pi1, &hst1, &out1, &stale1);
377 hst_fill_compare(pi2, &hst2, &out2, &stale2);
379 /* Check here if estimated latency for two paths are too similar.
380 * If this is the case, we skip extra calculation and just compare
381 * outstanding requests. In this case, any unloaded paths will
385 over_threshold = hst1 > (s->threshold_multiplier * hst2);
387 over_threshold = hst2 > (s->threshold_multiplier * hst1);
393 * If an unloaded path is stale, choose it. If both paths are unloaded,
394 * choose path that is the most stale.
395 * (If one path is loaded, choose the other)
397 if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
399 return (!out2 * stale1) - (!out1 * stale2);
401 /* Compare estimated service time. If outstanding is the same, we
402 * don't need to multiply
405 pi2_better = hst1 > hst2;
407 /* Potential overflow with out >= 1024 */
408 if (unlikely(out1 >= HST_MAX_INFLIGHT ||
409 out2 >= HST_MAX_INFLIGHT)) {
410 /* If over 1023 in-flights, we may overflow if hst
411 * is at max. (With this shift we still overflow at
412 * 1048576 in-flights, which is high enough).
414 hst1 >>= HST_FIXED_SHIFT;
415 hst2 >>= HST_FIXED_SHIFT;
417 pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
420 /* In the case that the 'winner' is stale, limit to equal usage. */
422 if (stale2 < time_now)
426 if (stale1 < time_now)
431 static struct dm_path *hst_select_path(struct path_selector *ps,
434 struct selector *s = ps->context;
435 struct path_info *pi = NULL, *best = NULL;
436 u64 time_now = sched_clock();
437 struct dm_path *ret = NULL;
440 spin_lock_irqsave(&s->lock, flags);
441 if (list_empty(&s->valid_paths))
444 list_for_each_entry(pi, &s->valid_paths, list) {
445 if (!best || (hst_compare(pi, best, time_now, ps) < 0))
452 /* Move last used path to end (least preferred in case of ties) */
453 list_move_tail(&best->list, &s->valid_paths);
458 spin_unlock_irqrestore(&s->lock, flags);
462 static int hst_start_io(struct path_selector *ps, struct dm_path *path,
465 struct path_info *pi = path->pscontext;
468 spin_lock_irqsave(&pi->lock, flags);
470 spin_unlock_irqrestore(&pi->lock, flags);
475 static u64 path_service_time(struct path_info *pi, u64 start_time)
477 u64 sched_now = ktime_get_ns();
479 /* if a previous disk request has finished after this IO was
480 * sent to the hardware, pretend the submission happened
483 if (time_after64(pi->last_finish, start_time))
484 start_time = pi->last_finish;
486 pi->last_finish = sched_now;
487 if (time_before64(sched_now, start_time))
490 return sched_now - start_time;
493 static int hst_end_io(struct path_selector *ps, struct dm_path *path,
494 size_t nr_bytes, u64 start_time)
496 struct path_info *pi = path->pscontext;
497 struct selector *s = ps->context;
501 spin_lock_irqsave(&pi->lock, flags);
503 st = path_service_time(pi, start_time);
505 pi->historical_service_time =
506 fixed_ema(pi->historical_service_time,
507 min(st * HST_FIXED_1, HST_FIXED_MAX),
511 * On request end, mark path as fresh. If a path hasn't
512 * finished any requests within the fresh period, the estimated
513 * service time is considered too optimistic and we limit the
514 * maximum requests on that path.
516 pi->stale_after = pi->last_finish +
517 (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
519 spin_unlock_irqrestore(&pi->lock, flags);
524 static struct path_selector_type hst_ps = {
525 .name = "historical-service-time",
526 .module = THIS_MODULE,
529 .create = hst_create,
530 .destroy = hst_destroy,
531 .status = hst_status,
532 .add_path = hst_add_path,
533 .fail_path = hst_fail_path,
534 .reinstate_path = hst_reinstate_path,
535 .select_path = hst_select_path,
536 .start_io = hst_start_io,
537 .end_io = hst_end_io,
540 static int __init dm_hst_init(void)
542 int r = dm_register_path_selector(&hst_ps);
545 DMERR("register failed %d", r);
547 DMINFO("version " HST_VERSION " loaded");
552 static void __exit dm_hst_exit(void)
554 int r = dm_unregister_path_selector(&hst_ps);
557 DMERR("unregister failed %d", r);
560 module_init(dm_hst_init);
561 module_exit(dm_hst_exit);
563 MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
564 MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
565 MODULE_LICENSE("GPL");