drivers/md/dm-zoned-target.c
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>

#define DM_MSG_PREFIX           "zoned"

#define DMZ_MIN_BIOS            8192

/*
 * Zone BIO context.
 */
struct dmz_bioctx {
        struct dmz_target       *target;
        struct dm_zone          *zone;
        struct bio              *bio;
        atomic_t                ref;
        blk_status_t            status;
};

/*
 * Chunk work descriptor.
 */
struct dm_chunk_work {
        struct work_struct      work;
        atomic_t                refcount;
        struct dmz_target       *target;
        unsigned int            chunk;
        struct bio_list         bio_list;
};

/*
 * Target descriptor.
 */
struct dmz_target {
        struct dm_dev           *ddev;

        unsigned long           flags;

        /* Zoned block device information */
        struct dmz_dev          *dev;

        /* For metadata handling */
        struct dmz_metadata     *metadata;

        /* For reclaim */
        struct dmz_reclaim      *reclaim;

        /* For chunk work */
        struct mutex            chunk_lock;
        struct radix_tree_root  chunk_rxtree;
        struct workqueue_struct *chunk_wq;

        /* For cloned BIOs to zones */
        struct bio_set          *bio_set;

        /* For flush */
        spinlock_t              flush_lock;
        struct bio_list         flush_list;
        struct delayed_work     flush_work;
        struct workqueue_struct *flush_wq;
};

/*
 * Flush intervals (seconds).
 */
#define DMZ_FLUSH_PERIOD        (10 * HZ)

/*
 * Target BIO completion.
 */
static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
{
        struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));

        if (bioctx->status == BLK_STS_OK && status != BLK_STS_OK)
                bioctx->status = status;
        bio_endio(bio);
}

/*
 * Partial clone read BIO completion callback. This terminates the
 * target BIO when there are no more references to its context.
 */
static void dmz_read_bio_end_io(struct bio *bio)
{
        struct dmz_bioctx *bioctx = bio->bi_private;
        blk_status_t status = bio->bi_status;

        bio_put(bio);
        dmz_bio_endio(bioctx->bio, status);
}

/*
 * Issue a BIO to a zone. The BIO may only partially process the
 * original target BIO.
 */
static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone,
                               struct bio *bio, sector_t chunk_block,
                               unsigned int nr_blocks)
{
        struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
        sector_t sector;
        struct bio *clone;

        /* BIO remap sector */
        sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);

        /* If the read is not partial, there is no need to clone the BIO */
        if (nr_blocks == dmz_bio_blocks(bio)) {
                /* Setup and submit the BIO */
                bio->bi_iter.bi_sector = sector;
                atomic_inc(&bioctx->ref);
                generic_make_request(bio);
                return 0;
        }

        /* Partial BIO: we need to clone the BIO */
        clone = bio_clone_fast(bio, GFP_NOIO, dmz->bio_set);
        if (!clone)
                return -ENOMEM;

        /* Setup the clone */
        clone->bi_iter.bi_sector = sector;
        clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
        clone->bi_end_io = dmz_read_bio_end_io;
        clone->bi_private = bioctx;

        bio_advance(bio, clone->bi_iter.bi_size);

        /* Submit the clone */
        atomic_inc(&bioctx->ref);
        generic_make_request(clone);

        return 0;
}

/*
 * Zero out pages of discarded blocks accessed by a read BIO.
 */
static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
                                 sector_t chunk_block, unsigned int nr_blocks)
{
        unsigned int size = nr_blocks << DMZ_BLOCK_SHIFT;

        /* Clear nr_blocks */
        swap(bio->bi_iter.bi_size, size);
        zero_fill_bio(bio);
        swap(bio->bi_iter.bi_size, size);

        bio_advance(bio, size);
}

/*
 * Process a read BIO.
 */
static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
                           struct bio *bio)
{
        sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
        unsigned int nr_blocks = dmz_bio_blocks(bio);
        sector_t end_block = chunk_block + nr_blocks;
        struct dm_zone *rzone, *bzone;
        int ret;

        /* Reads into unmapped chunks need only zero the BIO buffer */
        if (!zone) {
                zero_fill_bio(bio);
                return 0;
        }

        dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks",
                      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
                      (dmz_is_rnd(zone) ? "RND" : "SEQ"),
                      dmz_id(dmz->metadata, zone),
                      (unsigned long long)chunk_block, nr_blocks);

        /* Check block validity to determine the read location */
        bzone = zone->bzone;
        while (chunk_block < end_block) {
                nr_blocks = 0;
                if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) {
                        /* Test block validity in the data zone */
                        ret = dmz_block_valid(dmz->metadata, zone, chunk_block);
                        if (ret < 0)
                                return ret;
                        if (ret > 0) {
                                /* Read data zone blocks */
                                nr_blocks = ret;
                                rzone = zone;
                        }
                }

                /*
                 * No valid blocks found in the data zone.
                 * Check the buffer zone, if there is one.
                 */
                if (!nr_blocks && bzone) {
                        ret = dmz_block_valid(dmz->metadata, bzone, chunk_block);
                        if (ret < 0)
                                return ret;
                        if (ret > 0) {
                                /* Read buffer zone blocks */
                                nr_blocks = ret;
                                rzone = bzone;
                        }
                }

                if (nr_blocks) {
                        /* Valid blocks found: read them */
                        nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block);
                        ret = dmz_submit_read_bio(dmz, rzone, bio, chunk_block, nr_blocks);
                        if (ret)
                                return ret;
                        chunk_block += nr_blocks;
                } else {
                        /* No valid block: zeroout the current BIO block */
                        dmz_handle_read_zero(dmz, bio, chunk_block, 1);
                        chunk_block++;
                }
        }

        return 0;
}

/*
 * Issue a write BIO to a zone.
 */
static void dmz_submit_write_bio(struct dmz_target *dmz, struct dm_zone *zone,
                                 struct bio *bio, sector_t chunk_block,
                                 unsigned int nr_blocks)
{
        struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));

        /* Setup and submit the BIO */
        bio_set_dev(bio, dmz->dev->bdev);
        bio->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
        atomic_inc(&bioctx->ref);
        generic_make_request(bio);

        if (dmz_is_seq(zone))
                zone->wp_block += nr_blocks;
}

/*
 * Write blocks directly in a data zone, at the write pointer.
 * If a buffer zone is assigned, invalidate the blocks written
 * in place.
 */
static int dmz_handle_direct_write(struct dmz_target *dmz,
                                   struct dm_zone *zone, struct bio *bio,
                                   sector_t chunk_block,
                                   unsigned int nr_blocks)
{
        struct dmz_metadata *zmd = dmz->metadata;
        struct dm_zone *bzone = zone->bzone;
        int ret;

        if (dmz_is_readonly(zone))
                return -EROFS;

        /* Submit write */
        dmz_submit_write_bio(dmz, zone, bio, chunk_block, nr_blocks);

        /*
         * Validate the blocks in the data zone and invalidate
         * in the buffer zone, if there is one.
         */
        ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
        if (ret == 0 && bzone)
                ret = dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);

        return ret;
}

/*
 * Write blocks in the buffer zone of @zone.
 * If no buffer zone is assigned yet, get one.
 * Called with @zone write locked.
 */
static int dmz_handle_buffered_write(struct dmz_target *dmz,
                                     struct dm_zone *zone, struct bio *bio,
                                     sector_t chunk_block,
                                     unsigned int nr_blocks)
{
        struct dmz_metadata *zmd = dmz->metadata;
        struct dm_zone *bzone;
        int ret;

        /* Get the buffer zone. One will be allocated if needed */
        bzone = dmz_get_chunk_buffer(zmd, zone);
        if (!bzone)
                return -ENOSPC;

        if (dmz_is_readonly(bzone))
                return -EROFS;

        /* Submit write */
        dmz_submit_write_bio(dmz, bzone, bio, chunk_block, nr_blocks);

        /*
         * Validate the blocks in the buffer zone
         * and invalidate in the data zone.
         */
        ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
        if (ret == 0 && chunk_block < zone->wp_block)
                ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);

        return ret;
}

/*
 * Process a write BIO.
 */
static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
                            struct bio *bio)
{
        sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
        unsigned int nr_blocks = dmz_bio_blocks(bio);

        if (!zone)
                return -ENOSPC;

        dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
                      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
                      (dmz_is_rnd(zone) ? "RND" : "SEQ"),
                      dmz_id(dmz->metadata, zone),
                      (unsigned long long)chunk_block, nr_blocks);

        if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) {
                /*
                 * zone is a random zone or it is a sequential zone
                 * and the BIO is aligned to the zone write pointer:
                 * direct write the zone.
                 */
                return dmz_handle_direct_write(dmz, zone, bio, chunk_block, nr_blocks);
        }

        /*
         * This is an unaligned write in a sequential zone:
         * use buffered write.
         */
        return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks);
}

/*
 * Process a discard BIO.
 */
static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
                              struct bio *bio)
{
        struct dmz_metadata *zmd = dmz->metadata;
        sector_t block = dmz_bio_block(bio);
        unsigned int nr_blocks = dmz_bio_blocks(bio);
        sector_t chunk_block = dmz_chunk_block(dmz->dev, block);
        int ret = 0;

        /* For unmapped chunks, there is nothing to do */
        if (!zone)
                return 0;

        if (dmz_is_readonly(zone))
                return -EROFS;

        dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
                      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
                      dmz_id(zmd, zone),
                      (unsigned long long)chunk_block, nr_blocks);

        /*
         * Invalidate blocks in the data zone and its
         * buffer zone if one is mapped.
         */
        if (dmz_is_rnd(zone) || chunk_block < zone->wp_block)
                ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
        if (ret == 0 && zone->bzone)
                ret = dmz_invalidate_blocks(zmd, zone->bzone,
                                            chunk_block, nr_blocks);
        return ret;
}

/*
 * Process a BIO.
 */
static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
                           struct bio *bio)
{
        struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
        struct dmz_metadata *zmd = dmz->metadata;
        struct dm_zone *zone;
        int ret;

        /*
         * Write may trigger a zone allocation. So make sure the
         * allocation can succeed.
         */
        if (bio_op(bio) == REQ_OP_WRITE)
                dmz_schedule_reclaim(dmz->reclaim);

        dmz_lock_metadata(zmd);

        /*
         * Get the data zone mapping the chunk. There may be no
         * mapping for read and discard. If a mapping is obtained,
         * the zone returned will be set to active state.
         */
        zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(dmz->dev, bio),
                                     bio_op(bio));
        if (IS_ERR(zone)) {
                ret = PTR_ERR(zone);
                goto out;
        }

        /* Process the BIO */
        if (zone) {
                dmz_activate_zone(zone);
                bioctx->zone = zone;
        }

        switch (bio_op(bio)) {
        case REQ_OP_READ:
                ret = dmz_handle_read(dmz, zone, bio);
                break;
        case REQ_OP_WRITE:
                ret = dmz_handle_write(dmz, zone, bio);
                break;
        case REQ_OP_DISCARD:
        case REQ_OP_WRITE_ZEROES:
                ret = dmz_handle_discard(dmz, zone, bio);
                break;
        default:
                dmz_dev_err(dmz->dev, "Unsupported BIO operation 0x%x",
                            bio_op(bio));
                ret = -EIO;
        }

        /*
         * Release the chunk mapping. This will check that the mapping
         * is still valid, that is, that the zone used still has valid blocks.
         */
        if (zone)
                dmz_put_chunk_mapping(zmd, zone);
out:
        dmz_bio_endio(bio, errno_to_blk_status(ret));

        dmz_unlock_metadata(zmd);
}

/*
 * Increment a chunk work reference counter.
 */
static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
{
        atomic_inc(&cw->refcount);
}

/*
 * Decrement a chunk work reference count and
 * free it if it becomes 0.
 */
static void dmz_put_chunk_work(struct dm_chunk_work *cw)
{
        if (atomic_dec_and_test(&cw->refcount)) {
                WARN_ON(!bio_list_empty(&cw->bio_list));
                radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
                kfree(cw);
        }
}

/*
 * Chunk BIO work function.
 */
static void dmz_chunk_work(struct work_struct *work)
{
        struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
        struct dmz_target *dmz = cw->target;
        struct bio *bio;

        mutex_lock(&dmz->chunk_lock);

        /* Process the chunk BIOs */
        while ((bio = bio_list_pop(&cw->bio_list))) {
                mutex_unlock(&dmz->chunk_lock);
                dmz_handle_bio(dmz, cw, bio);
                mutex_lock(&dmz->chunk_lock);
                dmz_put_chunk_work(cw);
        }

        /* Queueing the work incremented the work refcount */
        dmz_put_chunk_work(cw);

        mutex_unlock(&dmz->chunk_lock);
}

/*
 * Flush work.
 */
static void dmz_flush_work(struct work_struct *work)
{
        struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
        struct bio *bio;
        int ret;

        /* Flush dirty metadata blocks */
        ret = dmz_flush_metadata(dmz->metadata);

        /* Process queued flush requests */
        while (1) {
                spin_lock(&dmz->flush_lock);
                bio = bio_list_pop(&dmz->flush_list);
                spin_unlock(&dmz->flush_lock);

                if (!bio)
                        break;

                dmz_bio_endio(bio, errno_to_blk_status(ret));
        }

        queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
}

/*
 * Get a chunk work and start it to process a new BIO.
 * If the BIO chunk has no work yet, create one.
 */
static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
        unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
        struct dm_chunk_work *cw;

        mutex_lock(&dmz->chunk_lock);

        /* Get the BIO chunk work. If one is not active yet, create one */
        cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
        if (!cw) {
                int ret;

                /* Create a new chunk work */
                cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
                if (!cw)
                        goto out;

                INIT_WORK(&cw->work, dmz_chunk_work);
                atomic_set(&cw->refcount, 0);
                cw->target = dmz;
                cw->chunk = chunk;
                bio_list_init(&cw->bio_list);

                ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
                if (unlikely(ret)) {
                        kfree(cw);
                        cw = NULL;
                        goto out;
                }
        }

        bio_list_add(&cw->bio_list, bio);
        dmz_get_chunk_work(cw);

        if (queue_work(dmz->chunk_wq, &cw->work))
                dmz_get_chunk_work(cw);
out:
        mutex_unlock(&dmz->chunk_lock);
}

/*
 * Process a new BIO.
 */
static int dmz_map(struct dm_target *ti, struct bio *bio)
{
        struct dmz_target *dmz = ti->private;
        struct dmz_dev *dev = dmz->dev;
        struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
        sector_t sector = bio->bi_iter.bi_sector;
        unsigned int nr_sectors = bio_sectors(bio);
        sector_t chunk_sector;

        dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
                      bio_op(bio), (unsigned long long)sector, nr_sectors,
                      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
                      (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)),
                      (unsigned int)dmz_bio_blocks(bio));

        bio_set_dev(bio, dev->bdev);

        if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
                return DM_MAPIO_REMAPPED;

        /* The BIO should be block aligned */
        if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
                return DM_MAPIO_KILL;

        /* Initialize the BIO context */
        bioctx->target = dmz;
        bioctx->zone = NULL;
        bioctx->bio = bio;
        atomic_set(&bioctx->ref, 1);
        bioctx->status = BLK_STS_OK;

        /* Set the BIO pending in the flush list */
        if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
                spin_lock(&dmz->flush_lock);
                bio_list_add(&dmz->flush_list, bio);
                spin_unlock(&dmz->flush_lock);
                mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
                return DM_MAPIO_SUBMITTED;
        }

        /* Split zone BIOs to fit entirely into a zone */
        chunk_sector = sector & (dev->zone_nr_sectors - 1);
        if (chunk_sector + nr_sectors > dev->zone_nr_sectors)
                dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);

        /* Now ready to handle this BIO */
        dmz_reclaim_bio_acc(dmz->reclaim);
        dmz_queue_chunk_work(dmz, bio);

        return DM_MAPIO_SUBMITTED;
}

/*
 * Completed target BIO processing.
 */
static int dmz_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
{
        struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));

        if (bioctx->status == BLK_STS_OK && *error)
                bioctx->status = *error;

        if (!atomic_dec_and_test(&bioctx->ref))
                return DM_ENDIO_INCOMPLETE;

        /* Done */
        bio->bi_status = bioctx->status;

        if (bioctx->zone) {
                struct dm_zone *zone = bioctx->zone;

                if (*error && bio_op(bio) == REQ_OP_WRITE) {
                        if (dmz_is_seq(zone))
                                set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
                }
                dmz_deactivate_zone(zone);
        }

        return DM_ENDIO_DONE;
}

/*
 * Get zoned device information.
 */
static int dmz_get_zoned_device(struct dm_target *ti, char *path)
{
        struct dmz_target *dmz = ti->private;
        struct request_queue *q;
        struct dmz_dev *dev;
        sector_t aligned_capacity;
        int ret;

        /* Get the target device */
        ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &dmz->ddev);
        if (ret) {
                ti->error = "Get target device failed";
                dmz->ddev = NULL;
                return ret;
        }

        dev = kzalloc(sizeof(struct dmz_dev), GFP_KERNEL);
        if (!dev) {
                ret = -ENOMEM;
                goto err;
        }

        dev->bdev = dmz->ddev->bdev;
        (void)bdevname(dev->bdev, dev->name);

        if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) {
                ti->error = "Not a zoned block device";
                ret = -EINVAL;
                goto err;
        }

        q = bdev_get_queue(dev->bdev);
        dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
        aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1);
        if (ti->begin ||
            ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
                ti->error = "Partial mapping not supported";
                ret = -EINVAL;
                goto err;
        }

        dev->zone_nr_sectors = blk_queue_zone_sectors(q);
        dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors);

        dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
        dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);

        dev->nr_zones = (dev->capacity + dev->zone_nr_sectors - 1)
                >> dev->zone_nr_sectors_shift;

        dmz->dev = dev;

        return 0;
err:
        dm_put_device(ti, dmz->ddev);
        kfree(dev);

        return ret;
}

/*
 * Cleanup zoned device information.
 */
static void dmz_put_zoned_device(struct dm_target *ti)
{
        struct dmz_target *dmz = ti->private;

        dm_put_device(ti, dmz->ddev);
        kfree(dmz->dev);
        dmz->dev = NULL;
}

/*
 * Setup target.
 */
static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        struct dmz_target *dmz;
        struct dmz_dev *dev;
        int ret;

        /* Check arguments */
        if (argc != 1) {
                ti->error = "Invalid argument count";
                return -EINVAL;
        }

        /* Allocate and initialize the target descriptor */
        dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
        if (!dmz) {
                ti->error = "Unable to allocate the zoned target descriptor";
                return -ENOMEM;
        }
        ti->private = dmz;

        /* Get the target zoned block device */
        ret = dmz_get_zoned_device(ti, argv[0]);
        if (ret) {
                dmz->ddev = NULL;
                goto err;
        }

        /* Initialize metadata */
        dev = dmz->dev;
        ret = dmz_ctr_metadata(dev, &dmz->metadata);
        if (ret) {
                ti->error = "Metadata initialization failed";
                goto err_dev;
        }

        /* Set target (no write same support) */
        ti->max_io_len = dev->zone_nr_sectors << 9;
        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
        ti->num_write_zeroes_bios = 1;
        ti->per_io_data_size = sizeof(struct dmz_bioctx);
        ti->flush_supported = true;
        ti->discards_supported = true;
        ti->split_discard_bios = true;

        /* The exposed capacity is the number of chunks that can be mapped */
        ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;

        /* Zone BIO */
        dmz->bio_set = bioset_create(DMZ_MIN_BIOS, 0, 0);
        if (!dmz->bio_set) {
                ti->error = "Create BIO set failed";
                ret = -ENOMEM;
                goto err_meta;
        }

        /* Chunk BIO work */
        mutex_init(&dmz->chunk_lock);
        INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_KERNEL);
        dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
                                        0, dev->name);
        if (!dmz->chunk_wq) {
                ti->error = "Create chunk workqueue failed";
                ret = -ENOMEM;
                goto err_bio;
        }

        /* Flush work */
        spin_lock_init(&dmz->flush_lock);
        bio_list_init(&dmz->flush_list);
        INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
        dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
                                                dev->name);
        if (!dmz->flush_wq) {
                ti->error = "Create flush workqueue failed";
                ret = -ENOMEM;
                goto err_cwq;
        }
        mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);

        /* Initialize reclaim */
        ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim);
        if (ret) {
                ti->error = "Zone reclaim initialization failed";
                goto err_fwq;
        }

        dmz_dev_info(dev, "Target device: %llu 512-byte logical sectors (%llu blocks)",
                     (unsigned long long)ti->len,
                     (unsigned long long)dmz_sect2blk(ti->len));

        return 0;
err_fwq:
        destroy_workqueue(dmz->flush_wq);
err_cwq:
        destroy_workqueue(dmz->chunk_wq);
err_bio:
        mutex_destroy(&dmz->chunk_lock);
        bioset_free(dmz->bio_set);
err_meta:
        dmz_dtr_metadata(dmz->metadata);
err_dev:
        dmz_put_zoned_device(ti);
err:
        kfree(dmz);

        return ret;
}

/*
 * Cleanup target.
 */
static void dmz_dtr(struct dm_target *ti)
{
        struct dmz_target *dmz = ti->private;

        flush_workqueue(dmz->chunk_wq);
        destroy_workqueue(dmz->chunk_wq);

        dmz_dtr_reclaim(dmz->reclaim);

        cancel_delayed_work_sync(&dmz->flush_work);
        destroy_workqueue(dmz->flush_wq);

        (void) dmz_flush_metadata(dmz->metadata);

        dmz_dtr_metadata(dmz->metadata);

        bioset_free(dmz->bio_set);

        dmz_put_zoned_device(ti);

        mutex_destroy(&dmz->chunk_lock);

        kfree(dmz);
}

/*
 * Setup target request queue limits.
 */
static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
        struct dmz_target *dmz = ti->private;
        unsigned int chunk_sectors = dmz->dev->zone_nr_sectors;

        limits->logical_block_size = DMZ_BLOCK_SIZE;
        limits->physical_block_size = DMZ_BLOCK_SIZE;

        blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
        blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);

        limits->discard_alignment = DMZ_BLOCK_SIZE;
        limits->discard_granularity = DMZ_BLOCK_SIZE;
        limits->max_discard_sectors = chunk_sectors;
        limits->max_hw_discard_sectors = chunk_sectors;
        limits->max_write_zeroes_sectors = chunk_sectors;

        /* FS hint to try to align to the device zone size */
        limits->chunk_sectors = chunk_sectors;
        limits->max_sectors = chunk_sectors;

        /* We are exposing a drive-managed zoned block device */
        limits->zoned = BLK_ZONED_NONE;
}

/*
 * Pass on ioctl to the backend device.
 */
static int dmz_prepare_ioctl(struct dm_target *ti,
                             struct block_device **bdev, fmode_t *mode)
{
        struct dmz_target *dmz = ti->private;

        *bdev = dmz->dev->bdev;

        return 0;
}

/*
 * Stop works on suspend.
 */
static void dmz_suspend(struct dm_target *ti)
{
        struct dmz_target *dmz = ti->private;

        flush_workqueue(dmz->chunk_wq);
        dmz_suspend_reclaim(dmz->reclaim);
        cancel_delayed_work_sync(&dmz->flush_work);
}

/*
 * Restart works on resume or if suspend failed.
 */
static void dmz_resume(struct dm_target *ti)
{
        struct dmz_target *dmz = ti->private;

        queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
        dmz_resume_reclaim(dmz->reclaim);
}

static int dmz_iterate_devices(struct dm_target *ti,
                               iterate_devices_callout_fn fn, void *data)
{
        struct dmz_target *dmz = ti->private;
        struct dmz_dev *dev = dmz->dev;
        sector_t capacity = dev->capacity & ~(dev->zone_nr_sectors - 1);

        return fn(ti, dmz->ddev, 0, capacity, data);
}

static struct target_type dmz_type = {
        .name            = "zoned",
        .version         = {1, 0, 0},
        .features        = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
        .module          = THIS_MODULE,
        .ctr             = dmz_ctr,
        .dtr             = dmz_dtr,
        .map             = dmz_map,
        .end_io          = dmz_end_io,
        .io_hints        = dmz_io_hints,
        .prepare_ioctl   = dmz_prepare_ioctl,
        .postsuspend     = dmz_suspend,
        .resume          = dmz_resume,
        .iterate_devices = dmz_iterate_devices,
};

static int __init dmz_init(void)
{
        return dm_register_target(&dmz_type);
}

static void __exit dmz_exit(void)
{
        dm_unregister_target(&dmz_type);
}

module_init(dmz_init);
module_exit(dmz_exit);

MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
MODULE_LICENSE("GPL");