Merge tag 's390-5.1-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
[sfrench/cifs-2.6.git] / drivers / md / dm-switch.c
1 /*
2  * Copyright (C) 2010-2012 by Dell Inc.  All rights reserved.
3  * Copyright (C) 2011-2013 Red Hat, Inc.
4  *
5  * This file is released under the GPL.
6  *
7  * dm-switch is a device-mapper target that maps IO to underlying block
8  * devices efficiently when there are a large number of fixed-sized
9  * address regions but there is no simple pattern to allow for a compact
10  * mapping representation such as dm-stripe.
11  */
12
13 #include <linux/device-mapper.h>
14
15 #include <linux/module.h>
16 #include <linux/init.h>
17 #include <linux/vmalloc.h>
18
19 #define DM_MSG_PREFIX "switch"
20
/*
 * Storage unit of the region table.  Each slot packs
 * <region_entries_per_slot> table entries, and every entry occupies
 * <region_table_entry_bits> bits of the slot.
 */
typedef unsigned long region_table_slot_t;
26
27 /*
28  * A device with the offset to its start sector.
29  */
30 struct switch_path {
31         struct dm_dev *dmdev;
32         sector_t start;
33 };
34
35 /*
36  * Context block for a dm switch device.
37  */
38 struct switch_ctx {
39         struct dm_target *ti;
40
41         unsigned nr_paths;              /* Number of paths in path_list. */
42
43         unsigned region_size;           /* Region size in 512-byte sectors */
44         unsigned long nr_regions;       /* Number of regions making up the device */
45         signed char region_size_bits;   /* log2 of region_size or -1 */
46
47         unsigned char region_table_entry_bits;  /* Number of bits in one region table entry */
48         unsigned char region_entries_per_slot;  /* Number of entries in one region table slot */
49         signed char region_entries_per_slot_bits;       /* log2 of region_entries_per_slot or -1 */
50
51         region_table_slot_t *region_table;      /* Region table */
52
53         /*
54          * Array of dm devices to switch between.
55          */
56         struct switch_path path_list[0];
57 };
58
59 static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
60                                            unsigned region_size)
61 {
62         struct switch_ctx *sctx;
63
64         sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
65                        GFP_KERNEL);
66         if (!sctx)
67                 return NULL;
68
69         sctx->ti = ti;
70         sctx->region_size = region_size;
71
72         ti->private = sctx;
73
74         return sctx;
75 }
76
77 static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
78 {
79         struct switch_ctx *sctx = ti->private;
80         sector_t nr_regions = ti->len;
81         sector_t nr_slots;
82
83         if (!(sctx->region_size & (sctx->region_size - 1)))
84                 sctx->region_size_bits = __ffs(sctx->region_size);
85         else
86                 sctx->region_size_bits = -1;
87
88         sctx->region_table_entry_bits = 1;
89         while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
90                (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
91                 sctx->region_table_entry_bits++;
92
93         sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
94         if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
95                 sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
96         else
97                 sctx->region_entries_per_slot_bits = -1;
98
99         if (sector_div(nr_regions, sctx->region_size))
100                 nr_regions++;
101
102         if (nr_regions >= ULONG_MAX) {
103                 ti->error = "Region table too large";
104                 return -EINVAL;
105         }
106         sctx->nr_regions = nr_regions;
107
108         nr_slots = nr_regions;
109         if (sector_div(nr_slots, sctx->region_entries_per_slot))
110                 nr_slots++;
111
112         if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
113                 ti->error = "Region table too large";
114                 return -EINVAL;
115         }
116
117         sctx->region_table = vmalloc(array_size(nr_slots,
118                                                 sizeof(region_table_slot_t)));
119         if (!sctx->region_table) {
120                 ti->error = "Cannot allocate region table";
121                 return -ENOMEM;
122         }
123
124         return 0;
125 }
126
127 static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
128                                 unsigned long *region_index, unsigned *bit)
129 {
130         if (sctx->region_entries_per_slot_bits >= 0) {
131                 *region_index = region_nr >> sctx->region_entries_per_slot_bits;
132                 *bit = region_nr & (sctx->region_entries_per_slot - 1);
133         } else {
134                 *region_index = region_nr / sctx->region_entries_per_slot;
135                 *bit = region_nr % sctx->region_entries_per_slot;
136         }
137
138         *bit *= sctx->region_table_entry_bits;
139 }
140
141 static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
142 {
143         unsigned long region_index;
144         unsigned bit;
145
146         switch_get_position(sctx, region_nr, &region_index, &bit);
147
148         return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
149                 ((1 << sctx->region_table_entry_bits) - 1);
150 }
151
152 /*
153  * Find which path to use at given offset.
154  */
155 static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
156 {
157         unsigned path_nr;
158         sector_t p;
159
160         p = offset;
161         if (sctx->region_size_bits >= 0)
162                 p >>= sctx->region_size_bits;
163         else
164                 sector_div(p, sctx->region_size);
165
166         path_nr = switch_region_table_read(sctx, p);
167
168         /* This can only happen if the processor uses non-atomic stores. */
169         if (unlikely(path_nr >= sctx->nr_paths))
170                 path_nr = 0;
171
172         return path_nr;
173 }
174
175 static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
176                                       unsigned value)
177 {
178         unsigned long region_index;
179         unsigned bit;
180         region_table_slot_t pte;
181
182         switch_get_position(sctx, region_nr, &region_index, &bit);
183
184         pte = sctx->region_table[region_index];
185         pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
186         pte |= (region_table_slot_t)value << bit;
187         sctx->region_table[region_index] = pte;
188 }
189
190 /*
191  * Fill the region table with an initial round robin pattern.
192  */
193 static void initialise_region_table(struct switch_ctx *sctx)
194 {
195         unsigned path_nr = 0;
196         unsigned long region_nr;
197
198         for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
199                 switch_region_table_write(sctx, region_nr, path_nr);
200                 if (++path_nr >= sctx->nr_paths)
201                         path_nr = 0;
202         }
203 }
204
205 static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
206 {
207         struct switch_ctx *sctx = ti->private;
208         unsigned long long start;
209         int r;
210
211         r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
212                           &sctx->path_list[sctx->nr_paths].dmdev);
213         if (r) {
214                 ti->error = "Device lookup failed";
215                 return r;
216         }
217
218         if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
219                 ti->error = "Invalid device starting offset";
220                 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
221                 return -EINVAL;
222         }
223
224         sctx->path_list[sctx->nr_paths].start = start;
225
226         sctx->nr_paths++;
227
228         return 0;
229 }
230
231 /*
232  * Destructor: Don't free the dm_target, just the ti->private data (if any).
233  */
234 static void switch_dtr(struct dm_target *ti)
235 {
236         struct switch_ctx *sctx = ti->private;
237
238         while (sctx->nr_paths--)
239                 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
240
241         vfree(sctx->region_table);
242         kfree(sctx);
243 }
244
245 /*
246  * Constructor arguments:
247  *   <num_paths> <region_size> <num_optional_args> [<optional_args>...]
248  *   [<dev_path> <offset>]+
249  *
250  * Optional args are to allow for future extension: currently this
251  * parameter must be 0.
252  */
253 static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
254 {
255         static const struct dm_arg _args[] = {
256                 {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
257                 {1, UINT_MAX, "Invalid region size"},
258                 {0, 0, "Invalid number of optional args"},
259         };
260
261         struct switch_ctx *sctx;
262         struct dm_arg_set as;
263         unsigned nr_paths, region_size, nr_optional_args;
264         int r;
265
266         as.argc = argc;
267         as.argv = argv;
268
269         r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
270         if (r)
271                 return -EINVAL;
272
273         r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
274         if (r)
275                 return r;
276
277         r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
278         if (r)
279                 return r;
280         /* parse optional arguments here, if we add any */
281
282         if (as.argc != nr_paths * 2) {
283                 ti->error = "Incorrect number of path arguments";
284                 return -EINVAL;
285         }
286
287         sctx = alloc_switch_ctx(ti, nr_paths, region_size);
288         if (!sctx) {
289                 ti->error = "Cannot allocate redirection context";
290                 return -ENOMEM;
291         }
292
293         r = dm_set_target_max_io_len(ti, region_size);
294         if (r)
295                 goto error;
296
297         while (as.argc) {
298                 r = parse_path(&as, ti);
299                 if (r)
300                         goto error;
301         }
302
303         r = alloc_region_table(ti, nr_paths);
304         if (r)
305                 goto error;
306
307         initialise_region_table(sctx);
308
309         /* For UNMAP, sending the request down any path is sufficient */
310         ti->num_discard_bios = 1;
311
312         return 0;
313
314 error:
315         switch_dtr(ti);
316
317         return r;
318 }
319
320 static int switch_map(struct dm_target *ti, struct bio *bio)
321 {
322         struct switch_ctx *sctx = ti->private;
323         sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
324         unsigned path_nr = switch_get_path_nr(sctx, offset);
325
326         bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
327         bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
328
329         return DM_MAPIO_REMAPPED;
330 }
331
332 /*
333  * We need to parse hex numbers in the message as quickly as possible.
334  *
335  * This table-based hex parser improves performance.
336  * It improves a time to load 1000000 entries compared to the condition-based
337  * parser.
338  *              table-based parser      condition-based parser
339  * PA-RISC      0.29s                   0.31s
340  * Opteron      0.0495s                 0.0498s
341  */
342 static const unsigned char hex_table[256] = {
343 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
344 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
345 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
346 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
347 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
348 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
349 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
350 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
351 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
352 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
353 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
354 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
355 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
356 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
357 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
358 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
359 };
360
361 static __always_inline unsigned long parse_hex(const char **string)
362 {
363         unsigned char d;
364         unsigned long r = 0;
365
366         while ((d = hex_table[(unsigned char)**string]) < 16) {
367                 r = (r << 4) | d;
368                 (*string)++;
369         }
370
371         return r;
372 }
373
374 static int process_set_region_mappings(struct switch_ctx *sctx,
375                                        unsigned argc, char **argv)
376 {
377         unsigned i;
378         unsigned long region_index = 0;
379
380         for (i = 1; i < argc; i++) {
381                 unsigned long path_nr;
382                 const char *string = argv[i];
383
384                 if ((*string & 0xdf) == 'R') {
385                         unsigned long cycle_length, num_write;
386
387                         string++;
388                         if (unlikely(*string == ',')) {
389                                 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
390                                 return -EINVAL;
391                         }
392                         cycle_length = parse_hex(&string);
393                         if (unlikely(*string != ',')) {
394                                 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
395                                 return -EINVAL;
396                         }
397                         string++;
398                         if (unlikely(!*string)) {
399                                 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
400                                 return -EINVAL;
401                         }
402                         num_write = parse_hex(&string);
403                         if (unlikely(*string)) {
404                                 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
405                                 return -EINVAL;
406                         }
407
408                         if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
409                                 DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
410                                        cycle_length - 1, region_index);
411                                 return -EINVAL;
412                         }
413                         if (unlikely(region_index + num_write < region_index) ||
414                             unlikely(region_index + num_write >= sctx->nr_regions)) {
415                                 DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
416                                        region_index, num_write, sctx->nr_regions);
417                                 return -EINVAL;
418                         }
419
420                         while (num_write--) {
421                                 region_index++;
422                                 path_nr = switch_region_table_read(sctx, region_index - cycle_length);
423                                 switch_region_table_write(sctx, region_index, path_nr);
424                         }
425
426                         continue;
427                 }
428
429                 if (*string == ':')
430                         region_index++;
431                 else {
432                         region_index = parse_hex(&string);
433                         if (unlikely(*string != ':')) {
434                                 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
435                                 return -EINVAL;
436                         }
437                 }
438
439                 string++;
440                 if (unlikely(!*string)) {
441                         DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
442                         return -EINVAL;
443                 }
444
445                 path_nr = parse_hex(&string);
446                 if (unlikely(*string)) {
447                         DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
448                         return -EINVAL;
449                 }
450                 if (unlikely(region_index >= sctx->nr_regions)) {
451                         DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
452                         return -EINVAL;
453                 }
454                 if (unlikely(path_nr >= sctx->nr_paths)) {
455                         DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
456                         return -EINVAL;
457                 }
458
459                 switch_region_table_write(sctx, region_index, path_nr);
460         }
461
462         return 0;
463 }
464
465 /*
466  * Messages are processed one-at-a-time.
467  *
468  * Only set_region_mappings is supported.
469  */
470 static int switch_message(struct dm_target *ti, unsigned argc, char **argv,
471                           char *result, unsigned maxlen)
472 {
473         static DEFINE_MUTEX(message_mutex);
474
475         struct switch_ctx *sctx = ti->private;
476         int r = -EINVAL;
477
478         mutex_lock(&message_mutex);
479
480         if (!strcasecmp(argv[0], "set_region_mappings"))
481                 r = process_set_region_mappings(sctx, argc, argv);
482         else
483                 DMWARN("Unrecognised message received.");
484
485         mutex_unlock(&message_mutex);
486
487         return r;
488 }
489
490 static void switch_status(struct dm_target *ti, status_type_t type,
491                           unsigned status_flags, char *result, unsigned maxlen)
492 {
493         struct switch_ctx *sctx = ti->private;
494         unsigned sz = 0;
495         int path_nr;
496
497         switch (type) {
498         case STATUSTYPE_INFO:
499                 result[0] = '\0';
500                 break;
501
502         case STATUSTYPE_TABLE:
503                 DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
504                 for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
505                         DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
506                                (unsigned long long)sctx->path_list[path_nr].start);
507                 break;
508         }
509 }
510
511 /*
512  * Switch ioctl:
513  *
514  * Passthrough all ioctls to the path for sector 0
515  */
516 static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
517 {
518         struct switch_ctx *sctx = ti->private;
519         unsigned path_nr;
520
521         path_nr = switch_get_path_nr(sctx, 0);
522
523         *bdev = sctx->path_list[path_nr].dmdev->bdev;
524
525         /*
526          * Only pass ioctls through if the device sizes match exactly.
527          */
528         if (ti->len + sctx->path_list[path_nr].start !=
529             i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
530                 return 1;
531         return 0;
532 }
533
534 static int switch_iterate_devices(struct dm_target *ti,
535                                   iterate_devices_callout_fn fn, void *data)
536 {
537         struct switch_ctx *sctx = ti->private;
538         int path_nr;
539         int r;
540
541         for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
542                 r = fn(ti, sctx->path_list[path_nr].dmdev,
543                          sctx->path_list[path_nr].start, ti->len, data);
544                 if (r)
545                         return r;
546         }
547
548         return 0;
549 }
550
551 static struct target_type switch_target = {
552         .name = "switch",
553         .version = {1, 1, 0},
554         .module = THIS_MODULE,
555         .ctr = switch_ctr,
556         .dtr = switch_dtr,
557         .map = switch_map,
558         .message = switch_message,
559         .status = switch_status,
560         .prepare_ioctl = switch_prepare_ioctl,
561         .iterate_devices = switch_iterate_devices,
562 };
563
564 static int __init dm_switch_init(void)
565 {
566         int r;
567
568         r = dm_register_target(&switch_target);
569         if (r < 0)
570                 DMERR("dm_register_target() failed %d", r);
571
572         return r;
573 }
574
575 static void __exit dm_switch_exit(void)
576 {
577         dm_unregister_target(&switch_target);
578 }
579
580 module_init(dm_switch_init);
581 module_exit(dm_switch_exit);
582
583 MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
584 MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
585 MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
586 MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
587 MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
588 MODULE_LICENSE("GPL");