xsk: Fix possible crash when multiple sockets are created
[sfrench/cifs-2.6.git] / drivers / md / dm-ps-historical-service-time.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Historical Service Time
4  *
5  *  Keeps a time-weighted exponential moving average of the historical
6  *  service time. Estimates future service time based on the historical
7  *  service time and the number of outstanding requests.
8  *
9  *  Marks paths stale if they have not finished within hst *
10  *  num_paths. If a path is stale and unused, we will send a single
11  *  request to probe in case the path has improved. This situation
12  *  generally arises if the path is so much worse than others that it
13  *  will never have the best estimated service time, or if the entire
14  *  multipath device is unused. If a path is stale and in use, limit the
15  *  number of requests it can receive with the assumption that the path
16  *  has become degraded.
17  *
18  *  To avoid repeatedly calculating exponents for time weighting, times
19  *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
20  *  ns, and the weighting is pre-calculated.
21  *
22  */
23
24 #include "dm.h"
25 #include "dm-path-selector.h"
26
27 #include <linux/blkdev.h>
28 #include <linux/slab.h>
29 #include <linux/module.h>
30 #include <linux/sched/clock.h>
31
32
33 #define DM_MSG_PREFIX   "multipath historical-service-time"
34 #define HST_MIN_IO 1
35 #define HST_VERSION "0.1.1"
36
37 #define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
38 #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
39 #define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
40 #define HST_FIXED_95 972
41 #define HST_MAX_INFLIGHT HST_FIXED_1
42 #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
43 #define HST_WEIGHT_COUNT 64ULL
44
45 struct selector {
46         struct list_head valid_paths;
47         struct list_head failed_paths;
48         int valid_count;
49         spinlock_t lock;
50
51         unsigned int weights[HST_WEIGHT_COUNT];
52         unsigned int threshold_multiplier;
53 };
54
55 struct path_info {
56         struct list_head list;
57         struct dm_path *path;
58         unsigned int repeat_count;
59
60         spinlock_t lock;
61
62         u64 historical_service_time; /* Fixed point */
63
64         u64 stale_after;
65         u64 last_finish;
66
67         u64 outstanding;
68 };
69
70 /**
71  * fixed_power - compute: x^n, in O(log n) time
72  *
73  * @x:         base of the power
74  * @frac_bits: fractional bits of @x
75  * @n:         power to raise @x to.
76  *
77  * By exploiting the relation between the definition of the natural power
78  * function: x^n := x*x*...*x (x multiplied by itself for n times), and
79  * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
80  * (where: n_i \elem {0, 1}, the binary vector representing n),
81  * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
82  * of course trivially computable in O(log_2 n), the length of our binary
83  * vector.
84  *
85  * (see: kernel/sched/loadavg.c)
86  */
87 static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
88 {
89         unsigned long result = 1UL << frac_bits;
90
91         if (n) {
92                 for (;;) {
93                         if (n & 1) {
94                                 result *= x;
95                                 result += 1UL << (frac_bits - 1);
96                                 result >>= frac_bits;
97                         }
98                         n >>= 1;
99                         if (!n)
100                                 break;
101                         x *= x;
102                         x += 1UL << (frac_bits - 1);
103                         x >>= frac_bits;
104                 }
105         }
106
107         return result;
108 }
109
110 /*
111  * Calculate the next value of an exponential moving average
112  * a_1 = a_0 * e + a * (1 - e)
113  *
114  * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
115  * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
116  * @weight: [0, HST_FIXED_1]
117  *
118  * Note:
119  *   To account for multiple periods in the same calculation,
120  *   a_n = a_0 * e^n + a * (1 - e^n),
121  *   so call fixed_ema(last, next, pow(weight, N))
122  */
123 static u64 fixed_ema(u64 last, u64 next, u64 weight)
124 {
125         last *= weight;
126         last += next * (HST_FIXED_1 - weight);
127         last += 1ULL << (HST_FIXED_SHIFT - 1);
128         return last >> HST_FIXED_SHIFT;
129 }
130
131 static struct selector *alloc_selector(void)
132 {
133         struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
134
135         if (s) {
136                 INIT_LIST_HEAD(&s->valid_paths);
137                 INIT_LIST_HEAD(&s->failed_paths);
138                 spin_lock_init(&s->lock);
139                 s->valid_count = 0;
140         }
141
142         return s;
143 }
144
145 /*
146  * Get the weight for a given time span.
147  */
148 static u64 hst_weight(struct path_selector *ps, u64 delta)
149 {
150         struct selector *s = ps->context;
151         int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
152                            HST_WEIGHT_COUNT - 1);
153
154         return s->weights[bucket];
155 }
156
157 /*
158  * Set up the weights array.
159  *
160  * weights[len-1] = 0
161  * weights[n] = base ^ (n + 1)
162  */
163 static void hst_set_weights(struct path_selector *ps, unsigned int base)
164 {
165         struct selector *s = ps->context;
166         int i;
167
168         if (base >= HST_FIXED_1)
169                 return;
170
171         for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
172                 s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
173         s->weights[HST_WEIGHT_COUNT - 1] = 0;
174 }
175
176 static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
177 {
178         struct selector *s;
179         unsigned int base_weight = HST_FIXED_95;
180         unsigned int threshold_multiplier = 0;
181         char dummy;
182
183         /*
184          * Arguments: [<base_weight> [<threshold_multiplier>]]
185          *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
186          *                  value of 0 will completely ignore any history.
187          *                  If not given, default (HST_FIXED_95) is used.
188          *   <threshold_multiplier>: Minimum threshold multiplier for paths to
189          *                  be considered different. That is, a path is
190          *                  considered different iff (p1 > N * p2) where p1
191          *                  is the path with higher service time. A threshold
192          *                  of 1 or 0 has no effect. Defaults to 0.
193          */
194         if (argc > 2)
195                 return -EINVAL;
196
197         if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
198              base_weight >= HST_FIXED_1)) {
199                 return -EINVAL;
200         }
201
202         if (argc > 1 && (sscanf(argv[1], "%u%c",
203                                 &threshold_multiplier, &dummy) != 1)) {
204                 return -EINVAL;
205         }
206
207         s = alloc_selector();
208         if (!s)
209                 return -ENOMEM;
210
211         ps->context = s;
212
213         hst_set_weights(ps, base_weight);
214         s->threshold_multiplier = threshold_multiplier;
215         return 0;
216 }
217
218 static void free_paths(struct list_head *paths)
219 {
220         struct path_info *pi, *next;
221
222         list_for_each_entry_safe(pi, next, paths, list) {
223                 list_del(&pi->list);
224                 kfree(pi);
225         }
226 }
227
228 static void hst_destroy(struct path_selector *ps)
229 {
230         struct selector *s = ps->context;
231
232         free_paths(&s->valid_paths);
233         free_paths(&s->failed_paths);
234         kfree(s);
235         ps->context = NULL;
236 }
237
238 static int hst_status(struct path_selector *ps, struct dm_path *path,
239                      status_type_t type, char *result, unsigned int maxlen)
240 {
241         unsigned int sz = 0;
242         struct path_info *pi;
243
244         if (!path) {
245                 struct selector *s = ps->context;
246
247                 DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
248         } else {
249                 pi = path->pscontext;
250
251                 switch (type) {
252                 case STATUSTYPE_INFO:
253                         DMEMIT("%llu %llu %llu ", pi->historical_service_time,
254                                pi->outstanding, pi->stale_after);
255                         break;
256                 case STATUSTYPE_TABLE:
257                         DMEMIT("0 ");
258                         break;
259                 case STATUSTYPE_IMA:
260                         *result = '\0';
261                         break;
262                 }
263         }
264
265         return sz;
266 }
267
268 static int hst_add_path(struct path_selector *ps, struct dm_path *path,
269                        int argc, char **argv, char **error)
270 {
271         struct selector *s = ps->context;
272         struct path_info *pi;
273         unsigned int repeat_count = HST_MIN_IO;
274         char dummy;
275         unsigned long flags;
276
277         /*
278          * Arguments: [<repeat_count>]
279          *   <repeat_count>: The number of I/Os before switching path.
280          *                   If not given, default (HST_MIN_IO) is used.
281          */
282         if (argc > 1) {
283                 *error = "historical-service-time ps: incorrect number of arguments";
284                 return -EINVAL;
285         }
286
287         if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
288                 *error = "historical-service-time ps: invalid repeat count";
289                 return -EINVAL;
290         }
291
292         /* allocate the path */
293         pi = kmalloc(sizeof(*pi), GFP_KERNEL);
294         if (!pi) {
295                 *error = "historical-service-time ps: Error allocating path context";
296                 return -ENOMEM;
297         }
298
299         pi->path = path;
300         pi->repeat_count = repeat_count;
301
302         pi->historical_service_time = HST_FIXED_1;
303
304         spin_lock_init(&pi->lock);
305         pi->outstanding = 0;
306
307         pi->stale_after = 0;
308         pi->last_finish = 0;
309
310         path->pscontext = pi;
311
312         spin_lock_irqsave(&s->lock, flags);
313         list_add_tail(&pi->list, &s->valid_paths);
314         s->valid_count++;
315         spin_unlock_irqrestore(&s->lock, flags);
316
317         return 0;
318 }
319
320 static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
321 {
322         struct selector *s = ps->context;
323         struct path_info *pi = path->pscontext;
324         unsigned long flags;
325
326         spin_lock_irqsave(&s->lock, flags);
327         list_move(&pi->list, &s->failed_paths);
328         s->valid_count--;
329         spin_unlock_irqrestore(&s->lock, flags);
330 }
331
332 static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
333 {
334         struct selector *s = ps->context;
335         struct path_info *pi = path->pscontext;
336         unsigned long flags;
337
338         spin_lock_irqsave(&s->lock, flags);
339         list_move_tail(&pi->list, &s->valid_paths);
340         s->valid_count++;
341         spin_unlock_irqrestore(&s->lock, flags);
342
343         return 0;
344 }
345
346 static void hst_fill_compare(struct path_info *pi, u64 *hst,
347                              u64 *out, u64 *stale)
348 {
349         unsigned long flags;
350
351         spin_lock_irqsave(&pi->lock, flags);
352         *hst = pi->historical_service_time;
353         *out = pi->outstanding;
354         *stale = pi->stale_after;
355         spin_unlock_irqrestore(&pi->lock, flags);
356 }
357
358 /*
359  * Compare the estimated service time of 2 paths, pi1 and pi2,
360  * for the incoming I/O.
361  *
362  * Returns:
363  * < 0 : pi1 is better
364  * 0   : no difference between pi1 and pi2
365  * > 0 : pi2 is better
366  *
367  */
368 static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
369                              u64 time_now, struct path_selector *ps)
370 {
371         struct selector *s = ps->context;
372         u64 hst1, hst2;
373         long long out1, out2, stale1, stale2;
374         int pi2_better, over_threshold;
375
376         hst_fill_compare(pi1, &hst1, &out1, &stale1);
377         hst_fill_compare(pi2, &hst2, &out2, &stale2);
378
379         /* Check here if estimated latency for two paths are too similar.
380          * If this is the case, we skip extra calculation and just compare
381          * outstanding requests. In this case, any unloaded paths will
382          * be preferred.
383          */
384         if (hst1 > hst2)
385                 over_threshold = hst1 > (s->threshold_multiplier * hst2);
386         else
387                 over_threshold = hst2 > (s->threshold_multiplier * hst1);
388
389         if (!over_threshold)
390                 return out1 - out2;
391
392         /*
393          * If an unloaded path is stale, choose it. If both paths are unloaded,
394          * choose path that is the most stale.
395          * (If one path is loaded, choose the other)
396          */
397         if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
398             (!out1 && !out2))
399                 return (!out2 * stale1) - (!out1 * stale2);
400
401         /* Compare estimated service time. If outstanding is the same, we
402          * don't need to multiply
403          */
404         if (out1 == out2) {
405                 pi2_better = hst1 > hst2;
406         } else {
407                 /* Potential overflow with out >= 1024 */
408                 if (unlikely(out1 >= HST_MAX_INFLIGHT ||
409                              out2 >= HST_MAX_INFLIGHT)) {
410                         /* If over 1023 in-flights, we may overflow if hst
411                          * is at max. (With this shift we still overflow at
412                          * 1048576 in-flights, which is high enough).
413                          */
414                         hst1 >>= HST_FIXED_SHIFT;
415                         hst2 >>= HST_FIXED_SHIFT;
416                 }
417                 pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
418         }
419
420         /* In the case that the 'winner' is stale, limit to equal usage. */
421         if (pi2_better) {
422                 if (stale2 < time_now)
423                         return out1 - out2;
424                 return 1;
425         }
426         if (stale1 < time_now)
427                 return out1 - out2;
428         return -1;
429 }
430
431 static struct dm_path *hst_select_path(struct path_selector *ps,
432                                        size_t nr_bytes)
433 {
434         struct selector *s = ps->context;
435         struct path_info *pi = NULL, *best = NULL;
436         u64 time_now = sched_clock();
437         struct dm_path *ret = NULL;
438         unsigned long flags;
439
440         spin_lock_irqsave(&s->lock, flags);
441         if (list_empty(&s->valid_paths))
442                 goto out;
443
444         list_for_each_entry(pi, &s->valid_paths, list) {
445                 if (!best || (hst_compare(pi, best, time_now, ps) < 0))
446                         best = pi;
447         }
448
449         if (!best)
450                 goto out;
451
452         /* Move last used path to end (least preferred in case of ties) */
453         list_move_tail(&best->list, &s->valid_paths);
454
455         ret = best->path;
456
457 out:
458         spin_unlock_irqrestore(&s->lock, flags);
459         return ret;
460 }
461
462 static int hst_start_io(struct path_selector *ps, struct dm_path *path,
463                         size_t nr_bytes)
464 {
465         struct path_info *pi = path->pscontext;
466         unsigned long flags;
467
468         spin_lock_irqsave(&pi->lock, flags);
469         pi->outstanding++;
470         spin_unlock_irqrestore(&pi->lock, flags);
471
472         return 0;
473 }
474
475 static u64 path_service_time(struct path_info *pi, u64 start_time)
476 {
477         u64 sched_now = ktime_get_ns();
478
479         /* if a previous disk request has finished after this IO was
480          * sent to the hardware, pretend the submission happened
481          * serially.
482          */
483         if (time_after64(pi->last_finish, start_time))
484                 start_time = pi->last_finish;
485
486         pi->last_finish = sched_now;
487         if (time_before64(sched_now, start_time))
488                 return 0;
489
490         return sched_now - start_time;
491 }
492
493 static int hst_end_io(struct path_selector *ps, struct dm_path *path,
494                       size_t nr_bytes, u64 start_time)
495 {
496         struct path_info *pi = path->pscontext;
497         struct selector *s = ps->context;
498         unsigned long flags;
499         u64 st;
500
501         spin_lock_irqsave(&pi->lock, flags);
502
503         st = path_service_time(pi, start_time);
504         pi->outstanding--;
505         pi->historical_service_time =
506                 fixed_ema(pi->historical_service_time,
507                           min(st * HST_FIXED_1, HST_FIXED_MAX),
508                           hst_weight(ps, st));
509
510         /*
511          * On request end, mark path as fresh. If a path hasn't
512          * finished any requests within the fresh period, the estimated
513          * service time is considered too optimistic and we limit the
514          * maximum requests on that path.
515          */
516         pi->stale_after = pi->last_finish +
517                 (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
518
519         spin_unlock_irqrestore(&pi->lock, flags);
520
521         return 0;
522 }
523
524 static struct path_selector_type hst_ps = {
525         .name           = "historical-service-time",
526         .module         = THIS_MODULE,
527         .table_args     = 1,
528         .info_args      = 3,
529         .create         = hst_create,
530         .destroy        = hst_destroy,
531         .status         = hst_status,
532         .add_path       = hst_add_path,
533         .fail_path      = hst_fail_path,
534         .reinstate_path = hst_reinstate_path,
535         .select_path    = hst_select_path,
536         .start_io       = hst_start_io,
537         .end_io         = hst_end_io,
538 };
539
540 static int __init dm_hst_init(void)
541 {
542         int r = dm_register_path_selector(&hst_ps);
543
544         if (r < 0)
545                 DMERR("register failed %d", r);
546
547         DMINFO("version " HST_VERSION " loaded");
548
549         return r;
550 }
551
552 static void __exit dm_hst_exit(void)
553 {
554         int r = dm_unregister_path_selector(&hst_ps);
555
556         if (r < 0)
557                 DMERR("unregister failed %d", r);
558 }
559
560 module_init(dm_hst_init);
561 module_exit(dm_hst_exit);
562
563 MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
564 MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
565 MODULE_LICENSE("GPL");