/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm-bio-prison-v1.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/jiffies.h>
#include <linux/log2.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sort.h>
#include <linux/rbtree.h>

#define DM_MSG_PREFIX   "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define COMMIT_PERIOD HZ
#define NO_SPACE_TIMEOUT_SECS 60

static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
                "A percentage of time allocated for copy on write");

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
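
/*
 * With 512-byte sectors (SECTOR_SHIFT == 9) these work out to:
 *   DATA_DEV_BLOCK_SIZE_MIN_SECTORS = 65536 >> 9      = 128 sectors
 *   DATA_DEV_BLOCK_SIZE_MAX_SECTORS = 1073741824 >> 9 = 2097152 sectors
 */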

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug further io to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * skipped if the io covers the whole block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, i.e. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 * the commit.
 *
 * The downside of this scheme is that the timestamp magic isn't perfect:
 * it will continue to think the data block in the snapshot device is
 * shared even after the write to the origin has broken sharing.  I
 * suspect data blocks will typically be shared by many different devices,
 * so we're breaking sharing n + 1 times, rather than n, where n is the
 * number of devices that reference this data block.  At the moment I
 * think the benefits far, far outweigh the disadvantages.
 */

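/*
 * For orientation, the steps above correspond roughly to this file's
 * helpers (illustrative sketch only):
 *
 *   i)   build_data_key() + bio_detain()  - lock the physical block
 *   ii)  dm_deferred_set_add_work()       - wait out in-flight readers
 *   iii) schedule_copy()                  - kcopyd copy / direct overwrite
 *   iv)  process_prepared_mapping()       - dm_thin_insert_block()
 *   v)   inc_remap_and_issue_cell()       - release the held bios
 */
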
/*----------------------------------------------------------------*/

/*
 * Key building.
 */
enum lock_space {
        VIRTUAL,
        PHYSICAL
};

static void build_key(struct dm_thin_device *td, enum lock_space ls,
                      dm_block_t b, dm_block_t e, struct dm_cell_key *key)
{
        key->virtual = (ls == VIRTUAL);
        key->dev = dm_thin_dev_id(td);
        key->block_begin = b;
        key->block_end = e;
}

static void build_data_key(struct dm_thin_device *td, dm_block_t b,
                           struct dm_cell_key *key)
{
        build_key(td, PHYSICAL, b, b + 1llu, key);
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
                              struct dm_cell_key *key)
{
        build_key(td, VIRTUAL, b, b + 1llu, key);
}

/*----------------------------------------------------------------*/

#define THROTTLE_THRESHOLD (1 * HZ)

struct throttle {
        struct rw_semaphore lock;
        unsigned long threshold;
        bool throttle_applied;
};

static void throttle_init(struct throttle *t)
{
        init_rwsem(&t->lock);
        t->throttle_applied = false;
}

static void throttle_work_start(struct throttle *t)
{
        t->threshold = jiffies + THROTTLE_THRESHOLD;
}

static void throttle_work_update(struct throttle *t)
{
        if (!t->throttle_applied && jiffies > t->threshold) {
                down_write(&t->lock);
                t->throttle_applied = true;
        }
}

static void throttle_work_complete(struct throttle *t)
{
        if (t->throttle_applied) {
                t->throttle_applied = false;
                up_write(&t->lock);
        }
}

static void throttle_lock(struct throttle *t)
{
        down_read(&t->lock);
}

static void throttle_unlock(struct throttle *t)
{
        up_read(&t->lock);
}

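/*
 * Illustrative calling pattern (not extra driver code): the worker
 * brackets each pass with
 *
 *   throttle_work_start(&pool->throttle);
 *   ...
 *   throttle_work_update(&pool->throttle);  (called as work proceeds)
 *   ...
 *   throttle_work_complete(&pool->throttle);
 *
 * while submitters wrap IO in throttle_lock()/throttle_unlock().  Once a
 * pass has run past THROTTLE_THRESHOLD the worker takes the rwsem for
 * write, pausing new submissions until the pass completes.
 */
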
/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device.  It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in various modes.  The modes are ordered by increasing
 * degradation so they can be compared numerically.
 */
enum pool_mode {
        PM_WRITE,               /* metadata may be changed */
        PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */

        /*
         * Like READ_ONLY, except may switch back to WRITE on metadata
         * resize. Reported as READ_ONLY.
         */
        PM_OUT_OF_METADATA_SPACE,
        PM_READ_ONLY,           /* metadata may not be changed */

        PM_FAIL,                /* all I/O fails */
};
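
/*
 * The ordering is relied on for range checks, e.g. commit() below
 * refuses to touch the metadata once the pool has degraded this far:
 *
 *   if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
 *           return -EINVAL;
 */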

struct pool_features {
        enum pool_mode mode;

        bool zero_new_blocks:1;
        bool discard_enabled:1;
        bool discard_passdown:1;
        bool error_if_no_space:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

#define CELL_SORT_ARRAY_SIZE 8192

struct pool {
        struct list_head list;
        struct dm_target *ti;   /* Only set if a pool target is bound */

        struct mapped_device *pool_md;
        struct block_device *md_dev;
        struct dm_pool_metadata *pmd;

        dm_block_t low_water_blocks;
        uint32_t sectors_per_block;
        int sectors_per_block_shift;

        struct pool_features pf;
        bool low_water_triggered:1;     /* A dm event has been sent */
        bool suspended:1;
        bool out_of_data_space:1;

        struct dm_bio_prison *prison;
        struct dm_kcopyd_client *copier;

        struct work_struct worker;
        struct workqueue_struct *wq;
        struct throttle throttle;
        struct delayed_work waker;
        struct delayed_work no_space_timeout;

        unsigned long last_commit_jiffies;
        unsigned ref_count;

        spinlock_t lock;
        struct bio_list deferred_flush_bios;
        struct list_head prepared_mappings;
        struct list_head prepared_discards;
        struct list_head prepared_discards_pt2;
        struct list_head active_thins;

        struct dm_deferred_set *shared_read_ds;
        struct dm_deferred_set *all_io_ds;

        struct dm_thin_new_mapping *next_mapping;

        process_bio_fn process_bio;
        process_bio_fn process_discard;

        process_cell_fn process_cell;
        process_cell_fn process_discard_cell;

        process_mapping_fn process_prepared_mapping;
        process_mapping_fn process_prepared_discard;
        process_mapping_fn process_prepared_discard_pt2;

        struct dm_bio_prison_cell **cell_sort_array;

        mempool_t mapping_pool;
};

static void metadata_operation_failed(struct pool *pool, const char *op, int r);

static enum pool_mode get_pool_mode(struct pool *pool)
{
        return pool->pf.mode;
}

static void notify_of_pool_mode_change(struct pool *pool)
{
        const char *descs[] = {
                "write",
                "out-of-data-space",
                "read-only",
                "read-only",
                "fail"
        };
        const char *extra_desc = NULL;
        enum pool_mode mode = get_pool_mode(pool);

        if (mode == PM_OUT_OF_DATA_SPACE) {
                if (!pool->pf.error_if_no_space)
                        extra_desc = " (queue IO)";
                else
                        extra_desc = " (error IO)";
        }

        dm_table_event(pool->ti->table);
        DMINFO("%s: switching pool to %s%s mode",
               dm_device_name(pool->pool_md),
               descs[(int)mode], extra_desc ? : "");
}

/*
 * Target context for a pool.
 */
struct pool_c {
        struct dm_target *ti;
        struct pool *pool;
        struct dm_dev *data_dev;
        struct dm_dev *metadata_dev;
        struct dm_target_callbacks callbacks;

        dm_block_t low_water_blocks;
        struct pool_features requested_pf; /* Features requested during table load */
        struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};

/*
 * Target context for a thin.
 */
struct thin_c {
        struct list_head list;
        struct dm_dev *pool_dev;
        struct dm_dev *origin_dev;
        sector_t origin_size;
        dm_thin_id dev_id;

        struct pool *pool;
        struct dm_thin_device *td;
        struct mapped_device *thin_md;

        bool requeue_mode:1;
        spinlock_t lock;
        struct list_head deferred_cells;
        struct bio_list deferred_bio_list;
        struct bio_list retry_on_resume_list;
        struct rb_root sort_bio_list; /* sorted list of deferred bios */

        /*
         * Ensures the thin is not destroyed until the worker has finished
         * iterating the active_thins list.
         */
        refcount_t refcount;
        struct completion can_destroy;
};

/*----------------------------------------------------------------*/

static bool block_size_is_power_of_two(struct pool *pool)
{
        return pool->sectors_per_block_shift >= 0;
}

static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
{
        return block_size_is_power_of_two(pool) ?
                (b << pool->sectors_per_block_shift) :
                (b * pool->sectors_per_block);
}
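
/*
 * e.g. with a 1MiB block size (2048 sectors, shift 11), block 5 starts
 * at sector 5 << 11 = 10240; with a non-power-of-two 192KiB block size
 * (384 sectors, shift < 0), block 5 starts at sector 5 * 384 = 1920.
 */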

/*----------------------------------------------------------------*/

struct discard_op {
        struct thin_c *tc;
        struct blk_plug plug;
        struct bio *parent_bio;
        struct bio *bio;
};

static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent)
{
        BUG_ON(!parent);

        op->tc = tc;
        blk_start_plug(&op->plug);
        op->parent_bio = parent;
        op->bio = NULL;
}

static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
{
        struct thin_c *tc = op->tc;
        sector_t s = block_to_sectors(tc->pool, data_b);
        sector_t len = block_to_sectors(tc->pool, data_e - data_b);

        return __blkdev_issue_discard(tc->pool_dev->bdev, s, len,
                                      GFP_NOWAIT, 0, &op->bio);
}

static void end_discard(struct discard_op *op, int r)
{
        if (op->bio) {
                /*
                 * Even if one of the calls to issue_discard failed, we
                 * need to wait for the chain to complete.
                 */
                bio_chain(op->bio, op->parent_bio);
                bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0);
                submit_bio(op->bio);
        }

        blk_finish_plug(&op->plug);

        /*
         * Even if r is set, there could be sub discards in flight that we
         * need to wait for.
         */
        if (r && !op->parent_bio->bi_status)
                op->parent_bio->bi_status = errno_to_blk_status(r);
        bio_endio(op->parent_bio);
}

/*----------------------------------------------------------------*/

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
        queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
                      struct dm_bio_prison_cell **cell_result)
{
        int r;
        struct dm_bio_prison_cell *cell_prealloc;

        /*
         * Allocate a cell from the prison's mempool.
         * This might block but it can't fail.
         */
        cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);

        r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
        if (r)
                /*
                 * We reused an old cell; we can get rid of
                 * the new one.
                 */
                dm_bio_prison_free_cell(pool->prison, cell_prealloc);

        return r;
}

static void cell_release(struct pool *pool,
                         struct dm_bio_prison_cell *cell,
                         struct bio_list *bios)
{
        dm_cell_release(pool->prison, cell, bios);
        dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_visit_release(struct pool *pool,
                               void (*fn)(void *, struct dm_bio_prison_cell *),
                               void *context,
                               struct dm_bio_prison_cell *cell)
{
        dm_cell_visit_release(pool->prison, fn, context, cell);
        dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_release_no_holder(struct pool *pool,
                                   struct dm_bio_prison_cell *cell,
                                   struct bio_list *bios)
{
        dm_cell_release_no_holder(pool->prison, cell, bios);
        dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_error_with_code(struct pool *pool,
                struct dm_bio_prison_cell *cell, blk_status_t error_code)
{
        dm_cell_error(pool->prison, cell, error_code);
        dm_bio_prison_free_cell(pool->prison, cell);
}

static blk_status_t get_pool_io_error_code(struct pool *pool)
{
        return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
}

static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
{
        cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
}

static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
{
        cell_error_with_code(pool, cell, 0);
}

static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
{
        cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
}

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
        struct mutex mutex;
        struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
        mutex_init(&dm_thin_pool_table.mutex);
        INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void pool_table_exit(void)
{
        mutex_destroy(&dm_thin_pool_table.mutex);
}

static void __pool_table_insert(struct pool *pool)
{
        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
        list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
        list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
        struct pool *pool = NULL, *tmp;

        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
                if (tmp->pool_md == md) {
                        pool = tmp;
                        break;
                }
        }

        return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
        struct pool *pool = NULL, *tmp;

        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
                if (tmp->md_dev == md_dev) {
                        pool = tmp;
                        break;
                }
        }

        return pool;
}

/*----------------------------------------------------------------*/

struct dm_thin_endio_hook {
        struct thin_c *tc;
        struct dm_deferred_entry *shared_read_entry;
        struct dm_deferred_entry *all_io_entry;
        struct dm_thin_new_mapping *overwrite_mapping;
        struct rb_node rb_node;
        struct dm_bio_prison_cell *cell;
};

static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
{
        bio_list_merge(bios, master);
        bio_list_init(master);
}

static void error_bio_list(struct bio_list *bios, blk_status_t error)
{
        struct bio *bio;

        while ((bio = bio_list_pop(bios))) {
                bio->bi_status = error;
                bio_endio(bio);
        }
}

static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
                blk_status_t error)
{
        struct bio_list bios;
        unsigned long flags;

        bio_list_init(&bios);

        spin_lock_irqsave(&tc->lock, flags);
        __merge_bio_list(&bios, master);
        spin_unlock_irqrestore(&tc->lock, flags);

        error_bio_list(&bios, error);
}

static void requeue_deferred_cells(struct thin_c *tc)
{
        struct pool *pool = tc->pool;
        unsigned long flags;
        struct list_head cells;
        struct dm_bio_prison_cell *cell, *tmp;

        INIT_LIST_HEAD(&cells);

        spin_lock_irqsave(&tc->lock, flags);
        list_splice_init(&tc->deferred_cells, &cells);
        spin_unlock_irqrestore(&tc->lock, flags);

        list_for_each_entry_safe(cell, tmp, &cells, user_list)
                cell_requeue(pool, cell);
}

static void requeue_io(struct thin_c *tc)
{
        struct bio_list bios;
        unsigned long flags;

        bio_list_init(&bios);

        spin_lock_irqsave(&tc->lock, flags);
        __merge_bio_list(&bios, &tc->deferred_bio_list);
        __merge_bio_list(&bios, &tc->retry_on_resume_list);
        spin_unlock_irqrestore(&tc->lock, flags);

        error_bio_list(&bios, BLK_STS_DM_REQUEUE);
        requeue_deferred_cells(tc);
}

static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
{
        struct thin_c *tc;

        rcu_read_lock();
        list_for_each_entry_rcu(tc, &pool->active_thins, list)
                error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
        rcu_read_unlock();
}

static void error_retry_list(struct pool *pool)
{
        error_retry_list_with_code(pool, get_pool_io_error_code(pool));
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc.)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
        struct pool *pool = tc->pool;
        sector_t block_nr = bio->bi_iter.bi_sector;

        if (block_size_is_power_of_two(pool))
                block_nr >>= pool->sectors_per_block_shift;
        else
                (void) sector_div(block_nr, pool->sectors_per_block);

        return block_nr;
}

/*
 * Returns the _complete_ blocks that this bio covers.
 */
static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
                                dm_block_t *begin, dm_block_t *end)
{
        struct pool *pool = tc->pool;
        sector_t b = bio->bi_iter.bi_sector;
        sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);

        b += pool->sectors_per_block - 1ull; /* so we round up */

        if (block_size_is_power_of_two(pool)) {
                b >>= pool->sectors_per_block_shift;
                e >>= pool->sectors_per_block_shift;
        } else {
                (void) sector_div(b, pool->sectors_per_block);
                (void) sector_div(e, pool->sectors_per_block);
        }

        if (e < b)
                /* Can happen if the bio is within a single block. */
                e = b;

        *begin = b;
        *end = e;
}
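
/*
 * e.g. with 2048-sector blocks, a bio covering sectors [2048, 5120)
 * yields begin = 1 and end = 2: block 1 (sectors 2048-4095) is fully
 * covered, while block 2 is only partially covered and so excluded.
 */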

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
        struct pool *pool = tc->pool;
        sector_t bi_sector = bio->bi_iter.bi_sector;

        bio_set_dev(bio, tc->pool_dev->bdev);
        if (block_size_is_power_of_two(pool))
                bio->bi_iter.bi_sector =
                        (block << pool->sectors_per_block_shift) |
                        (bi_sector & (pool->sectors_per_block - 1));
        else
                bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
                                 sector_div(bi_sector, pool->sectors_per_block);
}
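
/*
 * e.g. with 2048-sector blocks, a bio at virtual sector 3000 remapped
 * to data block 7 lands at (7 << 11) | (3000 & 2047) = 15288.
 */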

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
        bio_set_dev(bio, tc->origin_dev->bdev);
}

static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
        return op_is_flush(bio->bi_opf) &&
                dm_thin_changed_this_transaction(tc->td);
}

static void inc_all_io_entry(struct pool *pool, struct bio *bio)
{
        struct dm_thin_endio_hook *h;

        if (bio_op(bio) == REQ_OP_DISCARD)
                return;

        h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
        h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
        struct pool *pool = tc->pool;
        unsigned long flags;

        if (!bio_triggers_commit(tc, bio)) {
                generic_make_request(bio);
                return;
        }

        /*
         * Complete bio with an error if earlier I/O caused changes to
         * the metadata that can't be committed, e.g. due to I/O errors
         * on the metadata device.
         */
        if (dm_thin_aborted_changes(tc->td)) {
                bio_io_error(bio);
                return;
        }

        /*
         * Batch together any bios that trigger commits and then issue a
         * single commit for them in process_deferred_bios().
         */
        spin_lock_irqsave(&pool->lock, flags);
        bio_list_add(&pool->deferred_flush_bios, bio);
        spin_unlock_irqrestore(&pool->lock, flags);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
        remap_to_origin(tc, bio);
        issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
                            dm_block_t block)
{
        remap(tc, bio, block);
        issue(tc, bio);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
        struct list_head list;

        bool pass_discard:1;
        bool maybe_shared:1;

        /*
         * Track quiescing, copying and zeroing preparation actions.  When this
         * counter hits zero the block is prepared and can be inserted into the
         * btree.
         */
        atomic_t prepare_actions;

        blk_status_t status;
        struct thin_c *tc;
        dm_block_t virt_begin, virt_end;
        dm_block_t data_block;
        struct dm_bio_prison_cell *cell;

        /*
         * If the bio covers the whole area of a block then we can avoid
         * zeroing or copying.  Instead this bio is hooked.  The bio will
         * still be in the cell, so care has to be taken to avoid issuing
         * the bio twice.
         */
        struct bio *bio;
        bio_end_io_t *saved_bi_end_io;
};
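
/*
 * Illustrative lifecycle: schedule_copy() below starts prepare_actions
 * at 3 (quiesce + copy + its own temporary ref).  The deferred-set
 * wakeup, copy_complete() and schedule_copy()'s final
 * complete_mapping_preparation() each drop one; whichever drops the last
 * queues the mapping on pool->prepared_mappings for the worker.
 */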

static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
        struct pool *pool = m->tc->pool;

        if (atomic_dec_and_test(&m->prepare_actions)) {
                list_add_tail(&m->list, &pool->prepared_mappings);
                wake_worker(pool);
        }
}

static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
        unsigned long flags;
        struct pool *pool = m->tc->pool;

        spin_lock_irqsave(&pool->lock, flags);
        __complete_mapping_preparation(m);
        spin_unlock_irqrestore(&pool->lock, flags);
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
        struct dm_thin_new_mapping *m = context;

        m->status = read_err || write_err ? BLK_STS_IOERR : 0;
        complete_mapping_preparation(m);
}

static void overwrite_endio(struct bio *bio)
{
        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
        struct dm_thin_new_mapping *m = h->overwrite_mapping;

        bio->bi_end_io = m->saved_bi_end_io;

        m->status = bio->bi_status;
        complete_mapping_preparation(m);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell, except the original holder, back
 * to the deferred_bios list.
 */
static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
        struct pool *pool = tc->pool;
        unsigned long flags;

        spin_lock_irqsave(&tc->lock, flags);
        cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
        spin_unlock_irqrestore(&tc->lock, flags);

        wake_worker(pool);
}

static void thin_defer_bio(struct thin_c *tc, struct bio *bio);

struct remap_info {
        struct thin_c *tc;
        struct bio_list defer_bios;
        struct bio_list issue_bios;
};

static void __inc_remap_and_issue_cell(void *context,
                                       struct dm_bio_prison_cell *cell)
{
        struct remap_info *info = context;
        struct bio *bio;

        while ((bio = bio_list_pop(&cell->bios))) {
                if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
                        bio_list_add(&info->defer_bios, bio);
                else {
                        inc_all_io_entry(info->tc->pool, bio);

                        /*
                         * We can't issue the bios with the bio prison lock
                         * held, so we add them to a list to issue on
                         * return from this function.
                         */
                        bio_list_add(&info->issue_bios, bio);
                }
        }
}

static void inc_remap_and_issue_cell(struct thin_c *tc,
                                     struct dm_bio_prison_cell *cell,
                                     dm_block_t block)
{
        struct bio *bio;
        struct remap_info info;

        info.tc = tc;
        bio_list_init(&info.defer_bios);
        bio_list_init(&info.issue_bios);

        /*
         * We have to be careful to inc any bios we're about to issue
         * before the cell is released, and avoid a race with new bios
         * being added to the cell.
         */
        cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
                           &info, cell);

        while ((bio = bio_list_pop(&info.defer_bios)))
                thin_defer_bio(tc, bio);

        while ((bio = bio_list_pop(&info.issue_bios)))
                remap_and_issue(info.tc, bio, block);
}

static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
        cell_error(m->tc->pool, m->cell);
        list_del(&m->list);
        mempool_free(m, &m->tc->pool->mapping_pool);
}

static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
        struct thin_c *tc = m->tc;
        struct pool *pool = tc->pool;
        struct bio *bio = m->bio;
        int r;

        if (m->status) {
                cell_error(pool, m->cell);
                goto out;
        }

        /*
         * Commit the prepared block into the mapping btree.
         * Any I/O for this block arriving after this point will get
         * remapped to it directly.
         */
        r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
        if (r) {
                metadata_operation_failed(pool, "dm_thin_insert_block", r);
                cell_error(pool, m->cell);
                goto out;
        }

        /*
         * Release any bios held while the block was being provisioned.
         * If we are processing a write bio that completely covers the block,
         * we already processed it so can ignore it now when processing
         * the bios in the cell.
         */
        if (bio) {
                inc_remap_and_issue_cell(tc, m->cell, m->data_block);
                bio_endio(bio);
        } else {
                inc_all_io_entry(tc->pool, m->cell->holder);
                remap_and_issue(tc, m->cell->holder, m->data_block);
                inc_remap_and_issue_cell(tc, m->cell, m->data_block);
        }

out:
        list_del(&m->list);
        mempool_free(m, &pool->mapping_pool);
}

/*----------------------------------------------------------------*/

static void free_discard_mapping(struct dm_thin_new_mapping *m)
{
        struct thin_c *tc = m->tc;
        if (m->cell)
                cell_defer_no_holder(tc, m->cell);
        mempool_free(m, &tc->pool->mapping_pool);
}

static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
        bio_io_error(m->bio);
        free_discard_mapping(m);
}

static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
{
        bio_endio(m->bio);
        free_discard_mapping(m);
}

static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
{
        int r;
        struct thin_c *tc = m->tc;

        r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
        if (r) {
                metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
                bio_io_error(m->bio);
        } else
                bio_endio(m->bio);

        cell_defer_no_holder(tc, m->cell);
        mempool_free(m, &tc->pool->mapping_pool);
}

/*----------------------------------------------------------------*/

static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
                                                   struct bio *discard_parent)
{
        /*
         * We've already unmapped this range of blocks, but before we
         * passdown we have to check that these blocks are now unused.
         */
        int r = 0;
        bool shared = true;
        struct thin_c *tc = m->tc;
        struct pool *pool = tc->pool;
        dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
        struct discard_op op;

        begin_discard(&op, tc, discard_parent);
        while (b != end) {
                /* find start of unmapped run */
                for (; b < end; b++) {
                        r = dm_pool_block_is_shared(pool->pmd, b, &shared);
                        if (r)
                                goto out;

                        if (!shared)
                                break;
                }

                if (b == end)
                        break;

                /* find end of run */
                for (e = b + 1; e != end; e++) {
                        r = dm_pool_block_is_shared(pool->pmd, e, &shared);
                        if (r)
                                goto out;

                        if (shared)
                                break;
                }

                r = issue_discard(&op, b, e);
                if (r)
                        goto out;

                b = e;
        }
out:
        end_discard(&op, r);
}
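
/*
 * e.g. for data blocks [10, 16) where 12 and 13 turn out still to be
 * shared, the loop above issues discards for the runs [10, 12) and
 * [14, 16) only.
 */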

static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
{
        unsigned long flags;
        struct pool *pool = m->tc->pool;

        spin_lock_irqsave(&pool->lock, flags);
        list_add_tail(&m->list, &pool->prepared_discards_pt2);
        spin_unlock_irqrestore(&pool->lock, flags);
        wake_worker(pool);
}

static void passdown_endio(struct bio *bio)
{
        /*
         * It doesn't matter if the passdown discard failed, we still want
         * to unmap (we ignore err).
         */
        queue_passdown_pt2(bio->bi_private);
        bio_put(bio);
}

static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
{
        int r;
        struct thin_c *tc = m->tc;
        struct pool *pool = tc->pool;
        struct bio *discard_parent;
        dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);

        /*
         * Only this thread allocates blocks, so we can be sure that the
         * newly unmapped blocks will not be allocated before the end of
         * the function.
         */
        r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
        if (r) {
                metadata_operation_failed(pool, "dm_thin_remove_range", r);
                bio_io_error(m->bio);
                cell_defer_no_holder(tc, m->cell);
                mempool_free(m, &pool->mapping_pool);
                return;
        }

        /*
         * Increment the unmapped blocks.  This prevents a race between the
         * passdown io and reallocation of freed blocks.
         */
        r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
        if (r) {
                metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
                bio_io_error(m->bio);
                cell_defer_no_holder(tc, m->cell);
                mempool_free(m, &pool->mapping_pool);
                return;
        }

        discard_parent = bio_alloc(GFP_NOIO, 1);
        if (!discard_parent) {
                DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
                       dm_device_name(tc->pool->pool_md));
                queue_passdown_pt2(m);

        } else {
                discard_parent->bi_end_io = passdown_endio;
                discard_parent->bi_private = m;

                if (m->maybe_shared)
                        passdown_double_checking_shared_status(m, discard_parent);
                else {
                        struct discard_op op;

                        begin_discard(&op, tc, discard_parent);
                        r = issue_discard(&op, m->data_block, data_end);
                        end_discard(&op, r);
                }
        }
}

static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
{
        int r;
        struct thin_c *tc = m->tc;
        struct pool *pool = tc->pool;

        /*
         * The passdown has completed, so now we can decrement all those
         * unmapped blocks.
         */
        r = dm_pool_dec_data_range(pool->pmd, m->data_block,
                                   m->data_block + (m->virt_end - m->virt_begin));
        if (r) {
                metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
                bio_io_error(m->bio);
        } else
                bio_endio(m->bio);

        cell_defer_no_holder(tc, m->cell);
        mempool_free(m, &pool->mapping_pool);
}

static void process_prepared(struct pool *pool, struct list_head *head,
                             process_mapping_fn *fn)
{
        unsigned long flags;
        struct list_head maps;
        struct dm_thin_new_mapping *m, *tmp;

        INIT_LIST_HEAD(&maps);
        spin_lock_irqsave(&pool->lock, flags);
        list_splice_init(head, &maps);
        spin_unlock_irqrestore(&pool->lock, flags);

        list_for_each_entry_safe(m, tmp, &maps, list)
                (*fn)(m);
}

/*
 * Deferred bio jobs.
 */
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
        return bio->bi_iter.bi_size ==
                (pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
        return (bio_data_dir(bio) == WRITE) &&
                io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
                               bio_end_io_t *fn)
{
        *save = bio->bi_end_io;
        bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
        if (pool->next_mapping)
                return 0;

        pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC);

        return pool->next_mapping ? 0 : -ENOMEM;
}

static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
        struct dm_thin_new_mapping *m = pool->next_mapping;

        BUG_ON(!pool->next_mapping);

        memset(m, 0, sizeof(struct dm_thin_new_mapping));
        INIT_LIST_HEAD(&m->list);
        m->bio = NULL;

        pool->next_mapping = NULL;

        return m;
}

static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
                    sector_t begin, sector_t end)
{
        struct dm_io_region to;

        to.bdev = tc->pool_dev->bdev;
        to.sector = begin;
        to.count = end - begin;

        dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
}

static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
                                      dm_block_t data_begin,
                                      struct dm_thin_new_mapping *m)
{
        struct pool *pool = tc->pool;
        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

        h->overwrite_mapping = m;
        m->bio = bio;
        save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
        inc_all_io_entry(pool, bio);
        remap_and_issue(tc, bio, data_begin);
}

/*
 * A partial copy also needs to zero the uncopied region.
 */
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
                          struct dm_dev *origin, dm_block_t data_origin,
                          dm_block_t data_dest,
                          struct dm_bio_prison_cell *cell, struct bio *bio,
                          sector_t len)
{
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);

        m->tc = tc;
        m->virt_begin = virt_block;
        m->virt_end = virt_block + 1u;
        m->data_block = data_dest;
        m->cell = cell;

        /*
         * quiesce action + copy action + an extra reference held for the
         * duration of this function (we may need to inc later for a
         * partial zero).
         */
        atomic_set(&m->prepare_actions, 3);

        if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
                complete_mapping_preparation(m); /* already quiesced */

        /*
         * IO to pool_dev remaps to the pool target's data_dev.
         *
         * If the whole block of data is being overwritten, we can issue the
         * bio immediately. Otherwise we use kcopyd to clone the data first.
         */
        if (io_overwrites_block(pool, bio))
                remap_and_issue_overwrite(tc, bio, data_dest, m);
        else {
                struct dm_io_region from, to;

                from.bdev = origin->bdev;
                from.sector = data_origin * pool->sectors_per_block;
                from.count = len;

                to.bdev = tc->pool_dev->bdev;
                to.sector = data_dest * pool->sectors_per_block;
                to.count = len;

                dm_kcopyd_copy(pool->copier, &from, 1, &to,
                               0, copy_complete, m);

                /*
                 * Do we need to zero a tail region?
                 */
                if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
                        atomic_inc(&m->prepare_actions);
                        ll_zero(tc, m,
                                data_dest * pool->sectors_per_block + len,
                                (data_dest + 1) * pool->sectors_per_block);
                }
        }

        complete_mapping_preparation(m); /* drop our ref */
}

static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
                                   dm_block_t data_origin, dm_block_t data_dest,
                                   struct dm_bio_prison_cell *cell, struct bio *bio)
{
        schedule_copy(tc, virt_block, tc->pool_dev,
                      data_origin, data_dest, cell, bio,
                      tc->pool->sectors_per_block);
}

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
                          dm_block_t data_block, struct dm_bio_prison_cell *cell,
                          struct bio *bio)
{
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);

        atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
        m->tc = tc;
        m->virt_begin = virt_block;
        m->virt_end = virt_block + 1u;
        m->data_block = data_block;
        m->cell = cell;

        /*
         * If the whole block of data is being overwritten or we are not
         * zeroing pre-existing data, we can issue the bio immediately.
         * Otherwise we use kcopyd to zero the data first.
         */
        if (pool->pf.zero_new_blocks) {
                if (io_overwrites_block(pool, bio))
                        remap_and_issue_overwrite(tc, bio, data_block, m);
                else
                        ll_zero(tc, m, data_block * pool->sectors_per_block,
                                (data_block + 1) * pool->sectors_per_block);
        } else
                process_prepared_mapping(m);
}

static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
                                   dm_block_t data_dest,
                                   struct dm_bio_prison_cell *cell, struct bio *bio)
{
        struct pool *pool = tc->pool;
        sector_t virt_block_begin = virt_block * pool->sectors_per_block;
        sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;

        if (virt_block_end <= tc->origin_size)
                schedule_copy(tc, virt_block, tc->origin_dev,
                              virt_block, data_dest, cell, bio,
                              pool->sectors_per_block);

        else if (virt_block_begin < tc->origin_size)
                schedule_copy(tc, virt_block, tc->origin_dev,
                              virt_block, data_dest, cell, bio,
                              tc->origin_size - virt_block_begin);

        else
                schedule_zero(tc, virt_block, data_dest, cell, bio);
}
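
/*
 * e.g. with 2048-sector blocks and origin_size = 5000: block 1 ends at
 * sector 4096 and is copied whole; block 2 straddles the end of the
 * origin, so only 5000 - 4096 = 904 sectors are copied (schedule_copy
 * zeroes the tail); block 3 lies wholly beyond the origin and is simply
 * zeroed.
 */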

static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);

static void requeue_bios(struct pool *pool);

static bool is_read_only_pool_mode(enum pool_mode mode)
{
        return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
}

static bool is_read_only(struct pool *pool)
{
        return is_read_only_pool_mode(get_pool_mode(pool));
}

static void check_for_metadata_space(struct pool *pool)
{
        int r;
        const char *ooms_reason = NULL;
        dm_block_t nr_free;

        r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
        if (r)
                ooms_reason = "Could not get free metadata blocks";
        else if (!nr_free)
                ooms_reason = "No free metadata blocks";

        if (ooms_reason && !is_read_only(pool)) {
                DMERR("%s", ooms_reason);
                set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
        }
}

static void check_for_data_space(struct pool *pool)
{
        int r;
        dm_block_t nr_free;

        if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
                return;

        r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
        if (r)
                return;

        if (nr_free) {
                set_pool_mode(pool, PM_WRITE);
                requeue_bios(pool);
        }
}

/*
 * A non-zero return indicates read_only or fail_io mode.
 * Many callers don't care about the return value.
 */
static int commit(struct pool *pool)
{
        int r;

        if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
                return -EINVAL;

        r = dm_pool_commit_metadata(pool->pmd);
        if (r)
                metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
        else {
                check_for_metadata_space(pool);
                check_for_data_space(pool);
        }

        return r;
}

static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
{
        unsigned long flags;

        if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
                DMWARN("%s: reached low water mark for data device: sending event.",
                       dm_device_name(pool->pool_md));
                spin_lock_irqsave(&pool->lock, flags);
                pool->low_water_triggered = true;
                spin_unlock_irqrestore(&pool->lock, flags);
                dm_table_event(pool->ti->table);
        }
}

static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
        int r;
        dm_block_t free_blocks;
        struct pool *pool = tc->pool;

        if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
                return -EINVAL;

        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
        if (r) {
                metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
                return r;
        }

        check_low_water_mark(pool, free_blocks);

        if (!free_blocks) {
                /*
                 * Try to commit to see if that will free up some
                 * more space.
                 */
                r = commit(pool);
                if (r)
                        return r;

                r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
                if (r) {
                        metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
                        return r;
                }

                if (!free_blocks) {
                        set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
                        return -ENOSPC;
                }
        }

        r = dm_pool_alloc_data_block(pool->pmd, result);
        if (r) {
                if (r == -ENOSPC)
                        set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
                else
                        metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
                return r;
        }

        r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
        if (r) {
                metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
                return r;
        }

        if (!free_blocks) {
                /* Let's commit before we use up the metadata reserve. */
                r = commit(pool);
                if (r)
                        return r;
        }

        return 0;
}

/*
 * If we have run out of space, queue bios until the device is
 * resumed, presumably after having been reloaded with more space.
 */
static void retry_on_resume(struct bio *bio)
{
        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
        struct thin_c *tc = h->tc;
        unsigned long flags;

        spin_lock_irqsave(&tc->lock, flags);
        bio_list_add(&tc->retry_on_resume_list, bio);
        spin_unlock_irqrestore(&tc->lock, flags);
}

static blk_status_t should_error_unserviceable_bio(struct pool *pool)
{
        enum pool_mode m = get_pool_mode(pool);

        switch (m) {
        case PM_WRITE:
                /* Shouldn't get here */
                DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
                return BLK_STS_IOERR;

        case PM_OUT_OF_DATA_SPACE:
                return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;

        case PM_OUT_OF_METADATA_SPACE:
        case PM_READ_ONLY:
        case PM_FAIL:
                return BLK_STS_IOERR;
        default:
                /* Shouldn't get here */
                DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
                return BLK_STS_IOERR;
        }
}

static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
        blk_status_t error = should_error_unserviceable_bio(pool);

        if (error) {
                bio->bi_status = error;
                bio_endio(bio);
        } else
                retry_on_resume(bio);
}

static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
{
        struct bio *bio;
        struct bio_list bios;
        blk_status_t error;

        error = should_error_unserviceable_bio(pool);
        if (error) {
                cell_error_with_code(pool, cell, error);
                return;
        }

        bio_list_init(&bios);
        cell_release(pool, cell, &bios);

        while ((bio = bio_list_pop(&bios)))
                retry_on_resume(bio);
}
1622
1623 static void process_discard_cell_no_passdown(struct thin_c *tc,
1624                                              struct dm_bio_prison_cell *virt_cell)
1625 {
1626         struct pool *pool = tc->pool;
1627         struct dm_thin_new_mapping *m = get_next_mapping(pool);
1628
1629         /*
1630          * We don't need to lock the data blocks, since there's no
1631          * passdown.  We only lock data blocks for allocation and breaking sharing.
1632          */
1633         m->tc = tc;
1634         m->virt_begin = virt_cell->key.block_begin;
1635         m->virt_end = virt_cell->key.block_end;
1636         m->cell = virt_cell;
1637         m->bio = virt_cell->holder;
1638
1639         if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1640                 pool->process_prepared_discard(m);
1641 }
1642
1643 static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
1644                                  struct bio *bio)
1645 {
1646         struct pool *pool = tc->pool;
1647
1648         int r;
1649         bool maybe_shared;
1650         struct dm_cell_key data_key;
1651         struct dm_bio_prison_cell *data_cell;
1652         struct dm_thin_new_mapping *m;
1653         dm_block_t virt_begin, virt_end, data_begin;
1654
1655         while (begin != end) {
1656                 r = ensure_next_mapping(pool);
1657                 if (r)
1658                         /* we did our best */
1659                         return;
1660
1661                 r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
1662                                               &data_begin, &maybe_shared);
1663                 if (r)
1664                         /*
1665                          * Silently fail, letting any mappings we've
1666                          * created complete.
1667                          */
1668                         break;
1669
1670                 build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
1671                 if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
1672                 /* contention; we'll give up on this range */
1673                         begin = virt_end;
1674                         continue;
1675                 }
1676
1677                 /*
1678                  * IO may still be going to the destination block.  We must
1679                  * quiesce before we can do the removal.
1680                  */
1681                 m = get_next_mapping(pool);
1682                 m->tc = tc;
1683                 m->maybe_shared = maybe_shared;
1684                 m->virt_begin = virt_begin;
1685                 m->virt_end = virt_end;
1686                 m->data_block = data_begin;
1687                 m->cell = data_cell;
1688                 m->bio = bio;
1689
1690                 /*
1691                  * The parent bio must not complete before sub discard bios are
1692                  * chained to it (see end_discard's bio_chain)!
1693                  *
1694                  * This per-mapping bi_remaining increment is paired with
1695                  * the implicit decrement that occurs via bio_endio() in
1696                  * end_discard().
1697                  */
1698                 bio_inc_remaining(bio);
1699                 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1700                         pool->process_prepared_discard(m);
1701
1702                 begin = virt_end;
1703         }
1704 }
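/*
 * Worked example with hypothetical numbers: a discard over virtual
 * blocks [8, 16) where [8, 12) maps to data blocks [100, 104) and
 * [13, 16) maps to [700, 703) takes two trips round the loop:
 *
 *   1) dm_thin_find_mapped_range() returns virt [8, 12), data
 *      [100, 104); we detain that physical range, queue one mapping
 *      and advance begin to 12.
 *   2) the next call skips the unmapped block 12 and returns virt
 *      [13, 16), data [700, 703).
 *
 * Each queued mapping bumps bi_remaining, so the parent bio cannot
 * complete until every sub-range discard has finished.
 */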
1705
1706 static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
1707 {
1708         struct bio *bio = virt_cell->holder;
1709         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1710
1711         /*
1712          * The virt_cell will only get freed once the origin bio completes.
1713          * This means it will remain locked while all the individual
1714          * passdown bios are in flight.
1715          */
1716         h->cell = virt_cell;
1717         break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
1718
1719         /*
1720          * We complete the bio now, knowing that the bi_remaining field
1721          * will prevent completion until the sub range discards have
1722          * completed.
1723          */
1724         bio_endio(bio);
1725 }
1726
1727 static void process_discard_bio(struct thin_c *tc, struct bio *bio)
1728 {
1729         dm_block_t begin, end;
1730         struct dm_cell_key virt_key;
1731         struct dm_bio_prison_cell *virt_cell;
1732
1733         get_bio_block_range(tc, bio, &begin, &end);
1734         if (begin == end) {
1735                 /*
1736                  * The discard covers less than a block.
1737                  */
1738                 bio_endio(bio);
1739                 return;
1740         }
1741
1742         build_key(tc->td, VIRTUAL, begin, end, &virt_key);
1743         if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
1744                 /*
1745                  * Potential starvation issue: We're relying on the
1746                  * fs/application being well behaved, and not trying to
1747                  * send IO to a region at the same time as discarding it.
1748                  * If they do this persistently then it's possible this
1749                  * cell will never be granted.
1750                  */
1751                 return;
1752
1753         tc->pool->process_discard_cell(tc, virt_cell);
1754 }
1755
1756 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1757                           struct dm_cell_key *key,
1758                           struct dm_thin_lookup_result *lookup_result,
1759                           struct dm_bio_prison_cell *cell)
1760 {
1761         int r;
1762         dm_block_t data_block;
1763         struct pool *pool = tc->pool;
1764
1765         r = alloc_data_block(tc, &data_block);
1766         switch (r) {
1767         case 0:
1768                 schedule_internal_copy(tc, block, lookup_result->block,
1769                                        data_block, cell, bio);
1770                 break;
1771
1772         case -ENOSPC:
1773                 retry_bios_on_resume(pool, cell);
1774                 break;
1775
1776         default:
1777                 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1778                             __func__, r);
1779                 cell_error(pool, cell);
1780                 break;
1781         }
1782 }
1783
1784 static void __remap_and_issue_shared_cell(void *context,
1785                                           struct dm_bio_prison_cell *cell)
1786 {
1787         struct remap_info *info = context;
1788         struct bio *bio;
1789
1790         while ((bio = bio_list_pop(&cell->bios))) {
1791                 if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
1792                     bio_op(bio) == REQ_OP_DISCARD)
1793                         bio_list_add(&info->defer_bios, bio);
1794                 else {
1795                         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1796
1797                         h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
1798                         inc_all_io_entry(info->tc->pool, bio);
1799                         bio_list_add(&info->issue_bios, bio);
1800                 }
1801         }
1802 }
1803
1804 static void remap_and_issue_shared_cell(struct thin_c *tc,
1805                                         struct dm_bio_prison_cell *cell,
1806                                         dm_block_t block)
1807 {
1808         struct bio *bio;
1809         struct remap_info info;
1810
1811         info.tc = tc;
1812         bio_list_init(&info.defer_bios);
1813         bio_list_init(&info.issue_bios);
1814
1815         cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
1816                            &info, cell);
1817
1818         while ((bio = bio_list_pop(&info.defer_bios)))
1819                 thin_defer_bio(tc, bio);
1820
1821         while ((bio = bio_list_pop(&info.issue_bios)))
1822                 remap_and_issue(tc, bio, block);
1823 }
1824
1825 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1826                                dm_block_t block,
1827                                struct dm_thin_lookup_result *lookup_result,
1828                                struct dm_bio_prison_cell *virt_cell)
1829 {
1830         struct dm_bio_prison_cell *data_cell;
1831         struct pool *pool = tc->pool;
1832         struct dm_cell_key key;
1833
1834         /*
1835          * If cell is already occupied, then sharing is already in the process
1836          * of being broken, so we have nothing further to do here.
1837          */
1838         build_data_key(tc->td, lookup_result->block, &key);
1839         if (bio_detain(pool, &key, bio, &data_cell)) {
1840                 cell_defer_no_holder(tc, virt_cell);
1841                 return;
1842         }
1843
1844         if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1845                 break_sharing(tc, bio, block, &key, lookup_result, data_cell);
1846                 cell_defer_no_holder(tc, virt_cell);
1847         } else {
1848                 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1849
1850                 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1851                 inc_all_io_entry(pool, bio);
1852                 remap_and_issue(tc, bio, lookup_result->block);
1853
1854                 remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
1855                 remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
1856         }
1857 }
1858
1859 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1860                             struct dm_bio_prison_cell *cell)
1861 {
1862         int r;
1863         dm_block_t data_block;
1864         struct pool *pool = tc->pool;
1865
1866         /*
1867          * Remap empty bios (flushes) immediately, without provisioning.
1868          */
1869         if (!bio->bi_iter.bi_size) {
1870                 inc_all_io_entry(pool, bio);
1871                 cell_defer_no_holder(tc, cell);
1872
1873                 remap_and_issue(tc, bio, 0);
1874                 return;
1875         }
1876
1877         /*
1878          * Fill read bios with zeroes and complete them immediately.
1879          */
1880         if (bio_data_dir(bio) == READ) {
1881                 zero_fill_bio(bio);
1882                 cell_defer_no_holder(tc, cell);
1883                 bio_endio(bio);
1884                 return;
1885         }
1886
1887         r = alloc_data_block(tc, &data_block);
1888         switch (r) {
1889         case 0:
1890                 if (tc->origin_dev)
1891                         schedule_external_copy(tc, block, data_block, cell, bio);
1892                 else
1893                         schedule_zero(tc, block, data_block, cell, bio);
1894                 break;
1895
1896         case -ENOSPC:
1897                 retry_bios_on_resume(pool, cell);
1898                 break;
1899
1900         default:
1901                 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1902                             __func__, r);
1903                 cell_error(pool, cell);
1904                 break;
1905         }
1906 }
1907
1908 static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1909 {
1910         int r;
1911         struct pool *pool = tc->pool;
1912         struct bio *bio = cell->holder;
1913         dm_block_t block = get_bio_block(tc, bio);
1914         struct dm_thin_lookup_result lookup_result;
1915
1916         if (tc->requeue_mode) {
1917                 cell_requeue(pool, cell);
1918                 return;
1919         }
1920
1921         r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1922         switch (r) {
1923         case 0:
1924                 if (lookup_result.shared)
1925                         process_shared_bio(tc, bio, block, &lookup_result, cell);
1926                 else {
1927                         inc_all_io_entry(pool, bio);
1928                         remap_and_issue(tc, bio, lookup_result.block);
1929                         inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1930                 }
1931                 break;
1932
1933         case -ENODATA:
1934                 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1935                         inc_all_io_entry(pool, bio);
1936                         cell_defer_no_holder(tc, cell);
1937
1938                         if (bio_end_sector(bio) <= tc->origin_size)
1939                                 remap_to_origin_and_issue(tc, bio);
1940
1941                         else if (bio->bi_iter.bi_sector < tc->origin_size) {
1942                                 zero_fill_bio(bio);
1943                                 bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
1944                                 remap_to_origin_and_issue(tc, bio);
1945
1946                         } else {
1947                                 zero_fill_bio(bio);
1948                                 bio_endio(bio);
1949                         }
1950                 } else
1951                         provision_block(tc, bio, block, cell);
1952                 break;
1953
1954         default:
1955                 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1956                             __func__, r);
1957                 cell_defer_no_holder(tc, cell);
1958                 bio_io_error(bio);
1959                 break;
1960         }
1961 }
1962
1963 static void process_bio(struct thin_c *tc, struct bio *bio)
1964 {
1965         struct pool *pool = tc->pool;
1966         dm_block_t block = get_bio_block(tc, bio);
1967         struct dm_bio_prison_cell *cell;
1968         struct dm_cell_key key;
1969
1970         /*
1971          * If cell is already occupied, then the block is already
1972          * being provisioned, so we have nothing further to do here.
1973          */
1974         build_virtual_key(tc->td, block, &key);
1975         if (bio_detain(pool, &key, bio, &cell))
1976                 return;
1977
1978         process_cell(tc, cell);
1979 }
1980
1981 static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
1982                                     struct dm_bio_prison_cell *cell)
1983 {
1984         int r;
1985         int rw = bio_data_dir(bio);
1986         dm_block_t block = get_bio_block(tc, bio);
1987         struct dm_thin_lookup_result lookup_result;
1988
1989         r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1990         switch (r) {
1991         case 0:
1992                 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
1993                         handle_unserviceable_bio(tc->pool, bio);
1994                         if (cell)
1995                                 cell_defer_no_holder(tc, cell);
1996                 } else {
1997                         inc_all_io_entry(tc->pool, bio);
1998                         remap_and_issue(tc, bio, lookup_result.block);
1999                         if (cell)
2000                                 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
2001                 }
2002                 break;
2003
2004         case -ENODATA:
2005                 if (cell)
2006                         cell_defer_no_holder(tc, cell);
2007                 if (rw != READ) {
2008                         handle_unserviceable_bio(tc->pool, bio);
2009                         break;
2010                 }
2011
2012                 if (tc->origin_dev) {
2013                         inc_all_io_entry(tc->pool, bio);
2014                         remap_to_origin_and_issue(tc, bio);
2015                         break;
2016                 }
2017
2018                 zero_fill_bio(bio);
2019                 bio_endio(bio);
2020                 break;
2021
2022         default:
2023                 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
2024                             __func__, r);
2025                 if (cell)
2026                         cell_defer_no_holder(tc, cell);
2027                 bio_io_error(bio);
2028                 break;
2029         }
2030 }
2031
2032 static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
2033 {
2034         __process_bio_read_only(tc, bio, NULL);
2035 }
2036
2037 static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2038 {
2039         __process_bio_read_only(tc, cell->holder, cell);
2040 }
2041
2042 static void process_bio_success(struct thin_c *tc, struct bio *bio)
2043 {
2044         bio_endio(bio);
2045 }
2046
2047 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
2048 {
2049         bio_io_error(bio);
2050 }
2051
2052 static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2053 {
2054         cell_success(tc->pool, cell);
2055 }
2056
2057 static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2058 {
2059         cell_error(tc->pool, cell);
2060 }
2061
2062 /*
2063  * FIXME: should we also commit due to size of transaction, measured in
2064  * metadata blocks?
2065  */
2066 static int need_commit_due_to_time(struct pool *pool)
2067 {
2068         return !time_in_range(jiffies, pool->last_commit_jiffies,
2069                               pool->last_commit_jiffies + COMMIT_PERIOD);
2070 }
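/*
 * time_in_range() is built on time_after_eq()/time_before_eq(), so the
 * check above is safe across jiffies wraparound: it simply reports
 * whether more than COMMIT_PERIOD jiffies have passed since
 * last_commit_jiffies.
 */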
2071
2072 #define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
2073 #define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
2074
2075 static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
2076 {
2077         struct rb_node **rbp, *parent;
2078         struct dm_thin_endio_hook *pbd;
2079         sector_t bi_sector = bio->bi_iter.bi_sector;
2080
2081         rbp = &tc->sort_bio_list.rb_node;
2082         parent = NULL;
2083         while (*rbp) {
2084                 parent = *rbp;
2085                 pbd = thin_pbd(parent);
2086
2087                 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
2088                         rbp = &(*rbp)->rb_left;
2089                 else
2090                         rbp = &(*rbp)->rb_right;
2091         }
2092
2093         pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2094         rb_link_node(&pbd->rb_node, parent, rbp);
2095         rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
2096 }
2097
2098 static void __extract_sorted_bios(struct thin_c *tc)
2099 {
2100         struct rb_node *node;
2101         struct dm_thin_endio_hook *pbd;
2102         struct bio *bio;
2103
2104         for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
2105                 pbd = thin_pbd(node);
2106                 bio = thin_bio(pbd);
2107
2108                 bio_list_add(&tc->deferred_bio_list, bio);
2109                 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
2110         }
2111
2112         WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
2113 }
2114
2115 static void __sort_thin_deferred_bios(struct thin_c *tc)
2116 {
2117         struct bio *bio;
2118         struct bio_list bios;
2119
2120         bio_list_init(&bios);
2121         bio_list_merge(&bios, &tc->deferred_bio_list);
2122         bio_list_init(&tc->deferred_bio_list);
2123
2124         /* Sort deferred_bio_list using an rb-tree */
2125         while ((bio = bio_list_pop(&bios)))
2126                 __thin_bio_rb_add(tc, bio);
2127
2128         /*
2129          * Transfer the sorted bios in sort_bio_list back to
2130          * deferred_bio_list to allow lockless submission of
2131          * all bios.
2132          */
2133         __extract_sorted_bios(tc);
2134 }
2135
2136 static void process_thin_deferred_bios(struct thin_c *tc)
2137 {
2138         struct pool *pool = tc->pool;
2139         unsigned long flags;
2140         struct bio *bio;
2141         struct bio_list bios;
2142         struct blk_plug plug;
2143         unsigned count = 0;
2144
2145         if (tc->requeue_mode) {
2146                 error_thin_bio_list(tc, &tc->deferred_bio_list,
2147                                 BLK_STS_DM_REQUEUE);
2148                 return;
2149         }
2150
2151         bio_list_init(&bios);
2152
2153         spin_lock_irqsave(&tc->lock, flags);
2154
2155         if (bio_list_empty(&tc->deferred_bio_list)) {
2156                 spin_unlock_irqrestore(&tc->lock, flags);
2157                 return;
2158         }
2159
2160         __sort_thin_deferred_bios(tc);
2161
2162         bio_list_merge(&bios, &tc->deferred_bio_list);
2163         bio_list_init(&tc->deferred_bio_list);
2164
2165         spin_unlock_irqrestore(&tc->lock, flags);
2166
2167         blk_start_plug(&plug);
2168         while ((bio = bio_list_pop(&bios))) {
2169                 /*
2170                  * If we've got no free new_mapping structs, and processing
2171                  * this bio might require one, we pause until there are some
2172                  * prepared mappings to process.
2173                  */
2174                 if (ensure_next_mapping(pool)) {
2175                         spin_lock_irqsave(&tc->lock, flags);
2176                         bio_list_add(&tc->deferred_bio_list, bio);
2177                         bio_list_merge(&tc->deferred_bio_list, &bios);
2178                         spin_unlock_irqrestore(&tc->lock, flags);
2179                         break;
2180                 }
2181
2182                 if (bio_op(bio) == REQ_OP_DISCARD)
2183                         pool->process_discard(tc, bio);
2184                 else
2185                         pool->process_bio(tc, bio);
2186
2187                 if ((count++ & 127) == 0) {
2188                         throttle_work_update(&pool->throttle);
2189                         dm_pool_issue_prefetches(pool->pmd);
2190                 }
2191         }
2192         blk_finish_plug(&plug);
2193 }
2194
2195 static int cmp_cells(const void *lhs, const void *rhs)
2196 {
2197         struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
2198         struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
2199
2200         BUG_ON(!lhs_cell->holder);
2201         BUG_ON(!rhs_cell->holder);
2202
2203         if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
2204                 return -1;
2205
2206         if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
2207                 return 1;
2208
2209         return 0;
2210 }
2211
2212 static unsigned sort_cells(struct pool *pool, struct list_head *cells)
2213 {
2214         unsigned count = 0;
2215         struct dm_bio_prison_cell *cell, *tmp;
2216
2217         list_for_each_entry_safe(cell, tmp, cells, user_list) {
2218                 if (count >= CELL_SORT_ARRAY_SIZE)
2219                         break;
2220
2221                 pool->cell_sort_array[count++] = cell;
2222                 list_del(&cell->user_list);
2223         }
2224
2225         sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
2226
2227         return count;
2228 }
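/*
 * At most CELL_SORT_ARRAY_SIZE cells are consumed per call; anything
 * left over stays on @cells and is picked up by the caller's do/while
 * loop (see process_thin_deferred_cells() below).
 */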
2229
2230 static void process_thin_deferred_cells(struct thin_c *tc)
2231 {
2232         struct pool *pool = tc->pool;
2233         unsigned long flags;
2234         struct list_head cells;
2235         struct dm_bio_prison_cell *cell;
2236         unsigned i, j, count;
2237
2238         INIT_LIST_HEAD(&cells);
2239
2240         spin_lock_irqsave(&tc->lock, flags);
2241         list_splice_init(&tc->deferred_cells, &cells);
2242         spin_unlock_irqrestore(&tc->lock, flags);
2243
2244         if (list_empty(&cells))
2245                 return;
2246
2247         do {
2248                 count = sort_cells(tc->pool, &cells);
2249
2250                 for (i = 0; i < count; i++) {
2251                         cell = pool->cell_sort_array[i];
2252                         BUG_ON(!cell->holder);
2253
2254                         /*
2255                          * If we've got no free new_mapping structs, and processing
2256                          * this bio might require one, we pause until there are some
2257                          * prepared mappings to process.
2258                          */
2259                         if (ensure_next_mapping(pool)) {
2260                                 for (j = i; j < count; j++)
2261                                         list_add(&pool->cell_sort_array[j]->user_list, &cells);
2262
2263                                 spin_lock_irqsave(&tc->lock, flags);
2264                                 list_splice(&cells, &tc->deferred_cells);
2265                                 spin_unlock_irqrestore(&tc->lock, flags);
2266                                 return;
2267                         }
2268
2269                         if (bio_op(cell->holder) == REQ_OP_DISCARD)
2270                                 pool->process_discard_cell(tc, cell);
2271                         else
2272                                 pool->process_cell(tc, cell);
2273                 }
2274         } while (!list_empty(&cells));
2275 }
2276
2277 static void thin_get(struct thin_c *tc);
2278 static void thin_put(struct thin_c *tc);
2279
2280 /*
2281  * We can't hold rcu_read_lock() around code that can block.  So we
2282  * find a thin with the rcu lock held; bump a refcount; then drop
2283  * the lock.
2284  */
2285 static struct thin_c *get_first_thin(struct pool *pool)
2286 {
2287         struct thin_c *tc = NULL;
2288
2289         rcu_read_lock();
2290         if (!list_empty(&pool->active_thins)) {
2291                 tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
2292                 thin_get(tc);
2293         }
2294         rcu_read_unlock();
2295
2296         return tc;
2297 }
2298
2299 static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
2300 {
2301         struct thin_c *old_tc = tc;
2302
2303         rcu_read_lock();
2304         list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
2305                 thin_get(tc);
2306                 thin_put(old_tc);
2307                 rcu_read_unlock();
2308                 return tc;
2309         }
2310         thin_put(old_tc);
2311         rcu_read_unlock();
2312
2313         return NULL;
2314 }
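/*
 * Together, get_first_thin() and get_next_thin() form a reference-
 * counted cursor over pool->active_thins: each step takes the next
 * thin's reference under the RCU read lock before dropping the
 * previous one, so the thin being worked on cannot be freed while the
 * worker blocks.
 */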
2315
2316 static void process_deferred_bios(struct pool *pool)
2317 {
2318         unsigned long flags;
2319         struct bio *bio;
2320         struct bio_list bios;
2321         struct thin_c *tc;
2322
2323         tc = get_first_thin(pool);
2324         while (tc) {
2325                 process_thin_deferred_cells(tc);
2326                 process_thin_deferred_bios(tc);
2327                 tc = get_next_thin(pool, tc);
2328         }
2329
2330         /*
2331          * If there are any deferred flush bios, we must commit
2332          * the metadata before issuing them.
2333          */
2334         bio_list_init(&bios);
2335         spin_lock_irqsave(&pool->lock, flags);
2336         bio_list_merge(&bios, &pool->deferred_flush_bios);
2337         bio_list_init(&pool->deferred_flush_bios);
2338         spin_unlock_irqrestore(&pool->lock, flags);
2339
2340         if (bio_list_empty(&bios) &&
2341             !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
2342                 return;
2343
2344         if (commit(pool)) {
2345                 while ((bio = bio_list_pop(&bios)))
2346                         bio_io_error(bio);
2347                 return;
2348         }
2349         pool->last_commit_jiffies = jiffies;
2350
2351         while ((bio = bio_list_pop(&bios)))
2352                 generic_make_request(bio);
2353 }
2354
2355 static void do_worker(struct work_struct *ws)
2356 {
2357         struct pool *pool = container_of(ws, struct pool, worker);
2358
2359         throttle_work_start(&pool->throttle);
2360         dm_pool_issue_prefetches(pool->pmd);
2361         throttle_work_update(&pool->throttle);
2362         process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
2363         throttle_work_update(&pool->throttle);
2364         process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
2365         throttle_work_update(&pool->throttle);
2366         process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
2367         throttle_work_update(&pool->throttle);
2368         process_deferred_bios(pool);
2369         throttle_work_complete(&pool->throttle);
2370 }
2371
2372 /*
2373  * We want to commit periodically so that not too much
2374  * unwritten data builds up.
2375  */
2376 static void do_waker(struct work_struct *ws)
2377 {
2378         struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
2379         wake_worker(pool);
2380         queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
2381 }
2382
2383 /*
2384  * We're holding onto IO to allow userland time to react.  After the
2385  * timeout either the pool will have been resized (and thus back in
2386  * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE with error_if_no_space set.
2387  */
2388 static void do_no_space_timeout(struct work_struct *ws)
2389 {
2390         struct pool *pool = container_of(to_delayed_work(ws), struct pool,
2391                                          no_space_timeout);
2392
2393         if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
2394                 pool->pf.error_if_no_space = true;
2395                 notify_of_pool_mode_change(pool);
2396                 error_retry_list_with_code(pool, BLK_STS_NOSPC);
2397         }
2398 }
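/*
 * Note: no_space_timeout_secs is presumably wired up as a writable
 * module parameter elsewhere in this file, letting admins tune or
 * disable the hold-off; a timeout of zero skips queueing this work
 * entirely (see set_pool_mode()).
 */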
2399
2400 /*----------------------------------------------------------------*/
2401
2402 struct pool_work {
2403         struct work_struct worker;
2404         struct completion complete;
2405 };
2406
2407 static struct pool_work *to_pool_work(struct work_struct *ws)
2408 {
2409         return container_of(ws, struct pool_work, worker);
2410 }
2411
2412 static void pool_work_complete(struct pool_work *pw)
2413 {
2414         complete(&pw->complete);
2415 }
2416
2417 static void pool_work_wait(struct pool_work *pw, struct pool *pool,
2418                            void (*fn)(struct work_struct *))
2419 {
2420         INIT_WORK_ONSTACK(&pw->worker, fn);
2421         init_completion(&pw->complete);
2422         queue_work(pool->wq, &pw->worker);
2423         wait_for_completion(&pw->complete);
2424 }
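/*
 * Usage sketch (mirroring noflush_work below): embed a pool_work at
 * the head of a derived struct, have @fn recover the container via
 * to_pool_work() + container_of(), and finish @fn with
 * pool_work_complete() so the caller blocked in wait_for_completion()
 * resumes.
 */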
2425
2426 /*----------------------------------------------------------------*/
2427
2428 struct noflush_work {
2429         struct pool_work pw;
2430         struct thin_c *tc;
2431 };
2432
2433 static struct noflush_work *to_noflush(struct work_struct *ws)
2434 {
2435         return container_of(to_pool_work(ws), struct noflush_work, pw);
2436 }
2437
2438 static void do_noflush_start(struct work_struct *ws)
2439 {
2440         struct noflush_work *w = to_noflush(ws);
2441         w->tc->requeue_mode = true;
2442         requeue_io(w->tc);
2443         pool_work_complete(&w->pw);
2444 }
2445
2446 static void do_noflush_stop(struct work_struct *ws)
2447 {
2448         struct noflush_work *w = to_noflush(ws);
2449         w->tc->requeue_mode = false;
2450         pool_work_complete(&w->pw);
2451 }
2452
2453 static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
2454 {
2455         struct noflush_work w;
2456
2457         w.tc = tc;
2458         pool_work_wait(&w.pw, tc->pool, fn);
2459 }
2460
2461 /*----------------------------------------------------------------*/
2462
2463 static bool passdown_enabled(struct pool_c *pt)
2464 {
2465         return pt->adjusted_pf.discard_passdown;
2466 }
2467
2468 static void set_discard_callbacks(struct pool *pool)
2469 {
2470         struct pool_c *pt = pool->ti->private;
2471
2472         if (passdown_enabled(pt)) {
2473                 pool->process_discard_cell = process_discard_cell_passdown;
2474                 pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
2475                 pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
2476         } else {
2477                 pool->process_discard_cell = process_discard_cell_no_passdown;
2478                 pool->process_prepared_discard = process_prepared_discard_no_passdown;
2479         }
2480 }
2481
2482 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
2483 {
2484         struct pool_c *pt = pool->ti->private;
2485         bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
2486         enum pool_mode old_mode = get_pool_mode(pool);
2487         unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ;
2488
2489         /*
2490          * Never allow the pool to transition to PM_WRITE mode if user
2491          * intervention is required to verify metadata and data consistency.
2492          */
2493         if (new_mode == PM_WRITE && needs_check) {
2494                 DMERR("%s: unable to switch pool to write mode until repaired.",
2495                       dm_device_name(pool->pool_md));
2496                 if (old_mode != new_mode)
2497                         new_mode = old_mode;
2498                 else
2499                         new_mode = PM_READ_ONLY;
2500         }
2501         /*
2502          * If we were in PM_FAIL mode, rollback of metadata failed.  We're
2503          * not going to recover without a thin_repair.  So we never let the
2504          * pool move out of the old mode.
2505          */
2506         if (old_mode == PM_FAIL)
2507                 new_mode = old_mode;
2508
2509         switch (new_mode) {
2510         case PM_FAIL:
2511                 dm_pool_metadata_read_only(pool->pmd);
2512                 pool->process_bio = process_bio_fail;
2513                 pool->process_discard = process_bio_fail;
2514                 pool->process_cell = process_cell_fail;
2515                 pool->process_discard_cell = process_cell_fail;
2516                 pool->process_prepared_mapping = process_prepared_mapping_fail;
2517                 pool->process_prepared_discard = process_prepared_discard_fail;
2518
2519                 error_retry_list(pool);
2520                 break;
2521
2522         case PM_OUT_OF_METADATA_SPACE:
2523         case PM_READ_ONLY:
2524                 dm_pool_metadata_read_only(pool->pmd);
2525                 pool->process_bio = process_bio_read_only;
2526                 pool->process_discard = process_bio_success;
2527                 pool->process_cell = process_cell_read_only;
2528                 pool->process_discard_cell = process_cell_success;
2529                 pool->process_prepared_mapping = process_prepared_mapping_fail;
2530                 pool->process_prepared_discard = process_prepared_discard_success;
2531
2532                 error_retry_list(pool);
2533                 break;
2534
2535         case PM_OUT_OF_DATA_SPACE:
2536                 /*
2537                  * Ideally we'd never hit this state; the low water mark
2538                  * would trigger userland to extend the pool before we
2539                  * completely run out of data space.  However, many small
2540                  * IOs to unprovisioned space can consume data space at an
2541                  * alarming rate.  Adjust your low water mark if you're
2542                  * frequently seeing this mode.
2543                  */
2544                 pool->out_of_data_space = true;
2545                 pool->process_bio = process_bio_read_only;
2546                 pool->process_discard = process_discard_bio;
2547                 pool->process_cell = process_cell_read_only;
2548                 pool->process_prepared_mapping = process_prepared_mapping;
2549                 set_discard_callbacks(pool);
2550
2551                 if (!pool->pf.error_if_no_space && no_space_timeout)
2552                         queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
2553                 break;
2554
2555         case PM_WRITE:
2556                 if (old_mode == PM_OUT_OF_DATA_SPACE)
2557                         cancel_delayed_work_sync(&pool->no_space_timeout);
2558                 pool->out_of_data_space = false;
2559                 pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
2560                 dm_pool_metadata_read_write(pool->pmd);
2561                 pool->process_bio = process_bio;
2562                 pool->process_discard = process_discard_bio;
2563                 pool->process_cell = process_cell;
2564                 pool->process_prepared_mapping = process_prepared_mapping;
2565                 set_discard_callbacks(pool);
2566                 break;
2567         }
2568
2569         pool->pf.mode = new_mode;
2570         /*
2571          * The pool mode may have changed, sync it so bind_control_target()
2572          * doesn't cause an unexpected mode transition on resume.
2573          */
2574         pt->adjusted_pf.mode = new_mode;
2575
2576         if (old_mode != new_mode)
2577                 notify_of_pool_mode_change(pool);
2578 }
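/*
 * Transition rules encoded above, in brief:
 *
 *   - PM_FAIL is sticky: nothing short of a thin_repair and reload
 *     moves the pool out of it.
 *   - PM_WRITE is refused while the metadata carries needs_check; we
 *     stay in the old mode (falling back to PM_READ_ONLY if the old
 *     mode was PM_WRITE itself).
 *   - Entering PM_WRITE from PM_OUT_OF_DATA_SPACE cancels any pending
 *     no_space_timeout work.
 */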
2579
2580 static void abort_transaction(struct pool *pool)
2581 {
2582         const char *dev_name = dm_device_name(pool->pool_md);
2583
2584         DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
2585         if (dm_pool_abort_metadata(pool->pmd)) {
2586                 DMERR("%s: failed to abort metadata transaction", dev_name);
2587                 set_pool_mode(pool, PM_FAIL);
2588         }
2589
2590         if (dm_pool_metadata_set_needs_check(pool->pmd)) {
2591                 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
2592                 set_pool_mode(pool, PM_FAIL);
2593         }
2594 }
2595
2596 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
2597 {
2598         DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
2599                     dm_device_name(pool->pool_md), op, r);
2600
2601         abort_transaction(pool);
2602         set_pool_mode(pool, PM_READ_ONLY);
2603 }
2604
2605 /*----------------------------------------------------------------*/
2606
2607 /*
2608  * Mapping functions.
2609  */
2610
2611 /*
2612  * Called only while mapping a thin bio to hand it over to the workqueue.
2613  */
2614 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
2615 {
2616         unsigned long flags;
2617         struct pool *pool = tc->pool;
2618
2619         spin_lock_irqsave(&tc->lock, flags);
2620         bio_list_add(&tc->deferred_bio_list, bio);
2621         spin_unlock_irqrestore(&tc->lock, flags);
2622
2623         wake_worker(pool);
2624 }
2625
2626 static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
2627 {
2628         struct pool *pool = tc->pool;
2629
2630         throttle_lock(&pool->throttle);
2631         thin_defer_bio(tc, bio);
2632         throttle_unlock(&pool->throttle);
2633 }
2634
2635 static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2636 {
2637         unsigned long flags;
2638         struct pool *pool = tc->pool;
2639
2640         throttle_lock(&pool->throttle);
2641         spin_lock_irqsave(&tc->lock, flags);
2642         list_add_tail(&cell->user_list, &tc->deferred_cells);
2643         spin_unlock_irqrestore(&tc->lock, flags);
2644         throttle_unlock(&pool->throttle);
2645
2646         wake_worker(pool);
2647 }
2648
2649 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
2650 {
2651         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2652
2653         h->tc = tc;
2654         h->shared_read_entry = NULL;
2655         h->all_io_entry = NULL;
2656         h->overwrite_mapping = NULL;
2657         h->cell = NULL;
2658 }
2659
2660 /*
2661  * Non-blocking function called from the thin target's map function.
2662  */
2663 static int thin_bio_map(struct dm_target *ti, struct bio *bio)
2664 {
2665         int r;
2666         struct thin_c *tc = ti->private;
2667         dm_block_t block = get_bio_block(tc, bio);
2668         struct dm_thin_device *td = tc->td;
2669         struct dm_thin_lookup_result result;
2670         struct dm_bio_prison_cell *virt_cell, *data_cell;
2671         struct dm_cell_key key;
2672
2673         thin_hook_bio(tc, bio);
2674
2675         if (tc->requeue_mode) {
2676                 bio->bi_status = BLK_STS_DM_REQUEUE;
2677                 bio_endio(bio);
2678                 return DM_MAPIO_SUBMITTED;
2679         }
2680
2681         if (get_pool_mode(tc->pool) == PM_FAIL) {
2682                 bio_io_error(bio);
2683                 return DM_MAPIO_SUBMITTED;
2684         }
2685
2686         if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
2687                 thin_defer_bio_with_throttle(tc, bio);
2688                 return DM_MAPIO_SUBMITTED;
2689         }
2690
2691         /*
2692          * We must hold the virtual cell before doing the lookup, otherwise
2693          * there's a race with discard.
2694          */
2695         build_virtual_key(tc->td, block, &key);
2696         if (bio_detain(tc->pool, &key, bio, &virt_cell))
2697                 return DM_MAPIO_SUBMITTED;
2698
2699         r = dm_thin_find_block(td, block, 0, &result);
2700
2701         /*
2702          * Note that we defer readahead too.
2703          */
2704         switch (r) {
2705         case 0:
2706                 if (unlikely(result.shared)) {
2707                         /*
2708                          * We have a race condition here between the
2709                          * result.shared value returned by the lookup and
2710                          * snapshot creation, which may cause new
2711                          * sharing.
2712                          *
2713                          * To avoid this, always quiesce the origin before
2714                          * taking the snap.  You want to do this anyway to
2715                          * ensure a consistent application view
2716                          * (i.e. lockfs).
2717                          *
2718                          * More distant ancestors are irrelevant. The
2719                          * shared flag will be set in their case.
2720                          */
2721                         thin_defer_cell(tc, virt_cell);
2722                         return DM_MAPIO_SUBMITTED;
2723                 }
2724
2725                 build_data_key(tc->td, result.block, &key);
2726                 if (bio_detain(tc->pool, &key, bio, &data_cell)) {
2727                         cell_defer_no_holder(tc, virt_cell);
2728                         return DM_MAPIO_SUBMITTED;
2729                 }
2730
2731                 inc_all_io_entry(tc->pool, bio);
2732                 cell_defer_no_holder(tc, data_cell);
2733                 cell_defer_no_holder(tc, virt_cell);
2734
2735                 remap(tc, bio, result.block);
2736                 return DM_MAPIO_REMAPPED;
2737
2738         case -ENODATA:
2739         case -EWOULDBLOCK:
2740                 thin_defer_cell(tc, virt_cell);
2741                 return DM_MAPIO_SUBMITTED;
2742
2743         default:
2744                 /*
2745                  * Must always call bio_io_error on failure.
2746                  * dm_thin_find_block can fail with -EINVAL if the
2747                  * pool is switched to fail-io mode.
2748                  */
2749                 bio_io_error(bio);
2750                 cell_defer_no_holder(tc, virt_cell);
2751                 return DM_MAPIO_SUBMITTED;
2752         }
2753 }
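/*
 * Return value convention: DM_MAPIO_REMAPPED asks the dm core to
 * submit the now-remapped bio itself; DM_MAPIO_SUBMITTED means we have
 * taken ownership of the bio, either deferring it to the worker or
 * completing/erroring it here.
 */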
2754
2755 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2756 {
2757         struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
2758         struct request_queue *q;
2759
2760         if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
2761                 return 1;
2762
2763         q = bdev_get_queue(pt->data_dev->bdev);
2764         return bdi_congested(q->backing_dev_info, bdi_bits);
2765 }
2766
2767 static void requeue_bios(struct pool *pool)
2768 {
2769         unsigned long flags;
2770         struct thin_c *tc;
2771
2772         rcu_read_lock();
2773         list_for_each_entry_rcu(tc, &pool->active_thins, list) {
2774                 spin_lock_irqsave(&tc->lock, flags);
2775                 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
2776                 bio_list_init(&tc->retry_on_resume_list);
2777                 spin_unlock_irqrestore(&tc->lock, flags);
2778         }
2779         rcu_read_unlock();
2780 }
2781
2782 /*----------------------------------------------------------------
2783  * Binding of control targets to a pool object
2784  *--------------------------------------------------------------*/
2785 static bool data_dev_supports_discard(struct pool_c *pt)
2786 {
2787         struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2788
2789         return q && blk_queue_discard(q);
2790 }
2791
2792 static bool is_factor(sector_t block_size, uint32_t n)
2793 {
2794         return !sector_div(block_size, n);
2795 }
2796
2797 /*
2798  * If discard_passdown was enabled, verify that the data device
2799  * supports discards.  Disable discard_passdown if not.
2800  */
2801 static void disable_passdown_if_not_supported(struct pool_c *pt)
2802 {
2803         struct pool *pool = pt->pool;
2804         struct block_device *data_bdev = pt->data_dev->bdev;
2805         struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
2806         const char *reason = NULL;
2807         char buf[BDEVNAME_SIZE];
2808
2809         if (!pt->adjusted_pf.discard_passdown)
2810                 return;
2811
2812         if (!data_dev_supports_discard(pt))
2813                 reason = "discard unsupported";
2814
2815         else if (data_limits->max_discard_sectors < pool->sectors_per_block)
2816                 reason = "max discard sectors smaller than a block";
2817
2818         if (reason) {
2819                 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
2820                 pt->adjusted_pf.discard_passdown = false;
2821         }
2822 }
2823
2824 static int bind_control_target(struct pool *pool, struct dm_target *ti)
2825 {
2826         struct pool_c *pt = ti->private;
2827
2828         /*
2829          * We want to make sure that a pool in PM_FAIL mode is never upgraded.
2830          */
2831         enum pool_mode old_mode = get_pool_mode(pool);
2832         enum pool_mode new_mode = pt->adjusted_pf.mode;
2833
2834         /*
2835          * Don't change the pool's mode until set_pool_mode() below.
2836          * Otherwise the pool's process_* function pointers may
2837          * not match the desired pool mode.
2838          */
2839         pt->adjusted_pf.mode = old_mode;
2840
2841         pool->ti = ti;
2842         pool->pf = pt->adjusted_pf;
2843         pool->low_water_blocks = pt->low_water_blocks;
2844
2845         set_pool_mode(pool, new_mode);
2846
2847         return 0;
2848 }
2849
2850 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2851 {
2852         if (pool->ti == ti)
2853                 pool->ti = NULL;
2854 }
2855
2856 /*----------------------------------------------------------------
2857  * Pool creation
2858  *--------------------------------------------------------------*/
2859 /* Initialize pool features. */
2860 static void pool_features_init(struct pool_features *pf)
2861 {
2862         pf->mode = PM_WRITE;
2863         pf->zero_new_blocks = true;
2864         pf->discard_enabled = true;
2865         pf->discard_passdown = true;
2866         pf->error_if_no_space = false;
2867 }
2868
2869 static void __pool_destroy(struct pool *pool)
2870 {
2871         __pool_table_remove(pool);
2872
2873         vfree(pool->cell_sort_array);
2874         if (dm_pool_metadata_close(pool->pmd) < 0)
2875                 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2876
2877         dm_bio_prison_destroy(pool->prison);
2878         dm_kcopyd_client_destroy(pool->copier);
2879
2880         if (pool->wq)
2881                 destroy_workqueue(pool->wq);
2882
2883         if (pool->next_mapping)
2884                 mempool_free(pool->next_mapping, &pool->mapping_pool);
2885         mempool_exit(&pool->mapping_pool);
2886         dm_deferred_set_destroy(pool->shared_read_ds);
2887         dm_deferred_set_destroy(pool->all_io_ds);
2888         kfree(pool);
2889 }
2890
2891 static struct kmem_cache *_new_mapping_cache;
2892
2893 static struct pool *pool_create(struct mapped_device *pool_md,
2894                                 struct block_device *metadata_dev,
2895                                 unsigned long block_size,
2896                                 int read_only, char **error)
2897 {
2898         int r;
2899         void *err_p;
2900         struct pool *pool;
2901         struct dm_pool_metadata *pmd;
2902         bool format_device = !read_only;
2903
2904         pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
2905         if (IS_ERR(pmd)) {
2906                 *error = "Error creating metadata object";
2907                 return (struct pool *)pmd;
2908         }
2909
2910         pool = kzalloc(sizeof(*pool), GFP_KERNEL);
2911         if (!pool) {
2912                 *error = "Error allocating memory for pool";
2913                 err_p = ERR_PTR(-ENOMEM);
2914                 goto bad_pool;
2915         }
2916
2917         pool->pmd = pmd;
2918         pool->sectors_per_block = block_size;
2919         if (block_size & (block_size - 1))
2920                 pool->sectors_per_block_shift = -1;
2921         else
2922                 pool->sectors_per_block_shift = __ffs(block_size);
2923         pool->low_water_blocks = 0;
2924         pool_features_init(&pool->pf);
2925         pool->prison = dm_bio_prison_create();
2926         if (!pool->prison) {
2927                 *error = "Error creating pool's bio prison";
2928                 err_p = ERR_PTR(-ENOMEM);
2929                 goto bad_prison;
2930         }
2931
2932         pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2933         if (IS_ERR(pool->copier)) {
2934                 r = PTR_ERR(pool->copier);
2935                 *error = "Error creating pool's kcopyd client";
2936                 err_p = ERR_PTR(r);
2937                 goto bad_kcopyd_client;
2938         }
2939
2940         /*
2941          * Create a single-threaded workqueue that will service all devices
2942          * that use this metadata.
2943          */
2944         pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2945         if (!pool->wq) {
2946                 *error = "Error creating pool's workqueue";
2947                 err_p = ERR_PTR(-ENOMEM);
2948                 goto bad_wq;
2949         }
2950
2951         throttle_init(&pool->throttle);
2952         INIT_WORK(&pool->worker, do_worker);
2953         INIT_DELAYED_WORK(&pool->waker, do_waker);
2954         INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
2955         spin_lock_init(&pool->lock);
2956         bio_list_init(&pool->deferred_flush_bios);
2957         INIT_LIST_HEAD(&pool->prepared_mappings);
2958         INIT_LIST_HEAD(&pool->prepared_discards);
2959         INIT_LIST_HEAD(&pool->prepared_discards_pt2);
2960         INIT_LIST_HEAD(&pool->active_thins);
2961         pool->low_water_triggered = false;
2962         pool->suspended = true;
2963         pool->out_of_data_space = false;
2964
2965         pool->shared_read_ds = dm_deferred_set_create();
2966         if (!pool->shared_read_ds) {
2967                 *error = "Error creating pool's shared read deferred set";
2968                 err_p = ERR_PTR(-ENOMEM);
2969                 goto bad_shared_read_ds;
2970         }
2971
2972         pool->all_io_ds = dm_deferred_set_create();
2973         if (!pool->all_io_ds) {
2974                 *error = "Error creating pool's all io deferred set";
2975                 err_p = ERR_PTR(-ENOMEM);
2976                 goto bad_all_io_ds;
2977         }
2978
2979         pool->next_mapping = NULL;
2980         r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE,
2981                                    _new_mapping_cache);
2982         if (r) {
2983                 *error = "Error creating pool's mapping mempool";
2984                 err_p = ERR_PTR(r);
2985                 goto bad_mapping_pool;
2986         }
2987
2988         pool->cell_sort_array =
2989                 vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
2990                                    sizeof(*pool->cell_sort_array)));
2991         if (!pool->cell_sort_array) {
2992                 *error = "Error allocating cell sort array";
2993                 err_p = ERR_PTR(-ENOMEM);
2994                 goto bad_sort_array;
2995         }
2996
2997         pool->ref_count = 1;
2998         pool->last_commit_jiffies = jiffies;
2999         pool->pool_md = pool_md;
3000         pool->md_dev = metadata_dev;
3001         __pool_table_insert(pool);
3002
3003         return pool;
3004
3005 bad_sort_array:
3006         mempool_exit(&pool->mapping_pool);
3007 bad_mapping_pool:
3008         dm_deferred_set_destroy(pool->all_io_ds);
3009 bad_all_io_ds:
3010         dm_deferred_set_destroy(pool->shared_read_ds);
3011 bad_shared_read_ds:
3012         destroy_workqueue(pool->wq);
3013 bad_wq:
3014         dm_kcopyd_client_destroy(pool->copier);
3015 bad_kcopyd_client:
3016         dm_bio_prison_destroy(pool->prison);
3017 bad_prison:
3018         kfree(pool);
3019 bad_pool:
3020         if (dm_pool_metadata_close(pmd))
3021                 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
3022
3023         return err_p;
3024 }
3025
3026 static void __pool_inc(struct pool *pool)
3027 {
3028         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3029         pool->ref_count++;
3030 }
3031
3032 static void __pool_dec(struct pool *pool)
3033 {
3034         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3035         BUG_ON(!pool->ref_count);
3036         if (!--pool->ref_count)
3037                 __pool_destroy(pool);
3038 }
3039
3040 static struct pool *__pool_find(struct mapped_device *pool_md,
3041                                 struct block_device *metadata_dev,
3042                                 unsigned long block_size, int read_only,
3043                                 char **error, int *created)
3044 {
3045         struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
3046
3047         if (pool) {
3048                 if (pool->pool_md != pool_md) {
3049                         *error = "metadata device already in use by a pool";
3050                         return ERR_PTR(-EBUSY);
3051                 }
3052                 __pool_inc(pool);
3053
3054         } else {
3055                 pool = __pool_table_lookup(pool_md);
3056                 if (pool) {
3057                         if (pool->md_dev != metadata_dev) {
3058                                 *error = "different pool cannot replace a pool";
3059                                 return ERR_PTR(-EINVAL);
3060                         }
3061                         __pool_inc(pool);
3062
3063                 } else {
3064                         pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
3065                         *created = 1;
3066                 }
3067         }
3068
3069         return pool;
3070 }
3071
3072 /*----------------------------------------------------------------
3073  * Pool target methods
3074  *--------------------------------------------------------------*/
3075 static void pool_dtr(struct dm_target *ti)
3076 {
3077         struct pool_c *pt = ti->private;
3078
3079         mutex_lock(&dm_thin_pool_table.mutex);
3080
3081         unbind_control_target(pt->pool, ti);
3082         __pool_dec(pt->pool);
3083         dm_put_device(ti, pt->metadata_dev);
3084         dm_put_device(ti, pt->data_dev);
3085         kfree(pt);
3086
3087         mutex_unlock(&dm_thin_pool_table.mutex);
3088 }
3089
3090 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
3091                                struct dm_target *ti)
3092 {
3093         int r;
3094         unsigned argc;
3095         const char *arg_name;
3096
3097         static const struct dm_arg _args[] = {
3098                 {0, 4, "Invalid number of pool feature arguments"},
3099         };
3100
3101         /*
3102          * No feature arguments supplied.
3103          */
3104         if (!as->argc)
3105                 return 0;
3106
3107         r = dm_read_arg_group(_args, as, &argc, &ti->error);
3108         if (r)
3109                 return -EINVAL;
3110
3111         while (argc && !r) {
3112                 arg_name = dm_shift_arg(as);
3113                 argc--;
3114
3115                 if (!strcasecmp(arg_name, "skip_block_zeroing"))
3116                         pf->zero_new_blocks = false;
3117
3118                 else if (!strcasecmp(arg_name, "ignore_discard"))
3119                         pf->discard_enabled = false;
3120
3121                 else if (!strcasecmp(arg_name, "no_discard_passdown"))
3122                         pf->discard_passdown = false;
3123
3124                 else if (!strcasecmp(arg_name, "read_only"))
3125                         pf->mode = PM_READ_ONLY;
3126
3127                 else if (!strcasecmp(arg_name, "error_if_no_space"))
3128                         pf->error_if_no_space = true;
3129
3130                 else {
3131                         ti->error = "Unrecognised pool feature requested";
3132                         r = -EINVAL;
3133                         break;
3134                 }
3135         }
3136
3137         return r;
3138 }
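
/*
 * For example (an illustrative feature group, not taken from the docs):
 * trailing ctr arguments of "2 ignore_discard read_only" parse as
 * argc == 2, clearing pf->discard_enabled and setting pf->mode to
 * PM_READ_ONLY, while the other pool_features_init() defaults are left
 * untouched.
 */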
3139
3140 static void metadata_low_callback(void *context)
3141 {
3142         struct pool *pool = context;
3143
3144         DMWARN("%s: reached low water mark for metadata device: sending event.",
3145                dm_device_name(pool->pool_md));
3146
3147         dm_table_event(pool->ti->table);
3148 }
3149
3150 static sector_t get_dev_size(struct block_device *bdev)
3151 {
3152         return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
3153 }
3154
3155 static void warn_if_metadata_device_too_big(struct block_device *bdev)
3156 {
3157         sector_t metadata_dev_size = get_dev_size(bdev);
3158         char buffer[BDEVNAME_SIZE];
3159
3160         if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
3161                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
3162                        bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
3163 }
3164
3165 static sector_t get_metadata_dev_size(struct block_device *bdev)
3166 {
3167         sector_t metadata_dev_size = get_dev_size(bdev);
3168
3169         if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
3170                 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
3171
3172         return metadata_dev_size;
3173 }
3174
3175 static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
3176 {
3177         sector_t metadata_dev_size = get_metadata_dev_size(bdev);
3178
3179         sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
3180
3181         return metadata_dev_size;
3182 }
3183
3184 /*
3185  * When a metadata threshold is crossed a dm event is triggered, and
3186  * userland should respond by growing the metadata device.  We could let
3187  * userland set the threshold, like we do with the data threshold, but I'm
3188  * not sure they know enough to do this well.
3189  */
3190 static dm_block_t calc_metadata_threshold(struct pool_c *pt)
3191 {
3192         /*
3193          * 4M is ample for all ops with the possible exception of thin
3194          * device deletion which is harmless if it fails (just retry the
3195          * delete after you've grown the device).
3196          */
3197         dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
3198         return min((dm_block_t)1024ULL /* 4M */, quarter);
3199 }
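
/*
 * Worked example (assuming THIN_METADATA_BLOCK_SIZE is 8 sectors, ie.
 * 4KiB metadata blocks): a 4GiB metadata device is 8388608 sectors ==
 * 1048576 blocks, so quarter == 262144 and the threshold clamps to
 * 1024 blocks == 4MiB of metadata headroom.
 */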
3200
3201 /*
3202  * thin-pool <metadata dev> <data dev>
3203  *           <data block size (sectors)>
3204  *           <low water mark (blocks)>
3205  *           [<#feature args> [<arg>]*]
3206  *
3207  * Optional feature arguments are:
3208  *           skip_block_zeroing: skip the zeroing of newly-provisioned blocks.
3209  *           ignore_discard: disable discard support.
3210  *           no_discard_passdown: don't pass discards down to the data device.
3211  *           read_only: don't allow any changes to be made to the pool metadata.
3212  *           error_if_no_space: error IOs, instead of queueing them, if no space.
3213  */
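/*
 * A hypothetical example table line (device names invented) creating a
 * 10GiB pool with 64KiB blocks and a 32768-block low water mark:
 *
 *   0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768 1 skip_block_zeroing
 *
 * 128 sectors == 64KiB == DATA_DEV_BLOCK_SIZE_MIN_SECTORS, which is also
 * the multiple the ctr below requires the block size to align to.
 */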
3214 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
3215 {
3216         int r, pool_created = 0;
3217         struct pool_c *pt;
3218         struct pool *pool;
3219         struct pool_features pf;
3220         struct dm_arg_set as;
3221         struct dm_dev *data_dev;
3222         unsigned long block_size;
3223         dm_block_t low_water_blocks;
3224         struct dm_dev *metadata_dev;
3225         fmode_t metadata_mode;
3226
3227         /*
3228          * FIXME Remove validation from scope of lock.
3229          */
3230         mutex_lock(&dm_thin_pool_table.mutex);
3231
3232         if (argc < 4) {
3233                 ti->error = "Invalid argument count";
3234                 r = -EINVAL;
3235                 goto out_unlock;
3236         }
3237
3238         as.argc = argc;
3239         as.argv = argv;
3240
3241         /*
3242          * Set default pool features.
3243          */
3244         pool_features_init(&pf);
3245
3246         dm_consume_args(&as, 4);
3247         r = parse_pool_features(&as, &pf, ti);
3248         if (r)
3249                 goto out_unlock;
3250
3251         metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
3252         r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
3253         if (r) {
3254                 ti->error = "Error opening metadata block device";
3255                 goto out_unlock;
3256         }
3257         warn_if_metadata_device_too_big(metadata_dev->bdev);
3258
3259         r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
3260         if (r) {
3261                 ti->error = "Error getting data device";
3262                 goto out_metadata;
3263         }
3264
3265         if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
3266             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
3267             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
3268             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
3269                 ti->error = "Invalid block size";
3270                 r = -EINVAL;
3271                 goto out;
3272         }
3273
3274         if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
3275                 ti->error = "Invalid low water mark";
3276                 r = -EINVAL;
3277                 goto out;
3278         }
3279
3280         pt = kzalloc(sizeof(*pt), GFP_KERNEL);
3281         if (!pt) {
3282                 r = -ENOMEM;
3283                 goto out;
3284         }
3285
3286         pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
3287                            block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
3288         if (IS_ERR(pool)) {
3289                 r = PTR_ERR(pool);
3290                 goto out_free_pt;
3291         }
3292
3293         /*
3294          * 'pool_created' reflects whether this is the first table load.
3295          * Top level discard support is not allowed to be changed after
3296          * initial load.  This would require a pool reload to trigger thin
3297          * device changes.
3298          */
3299         if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
3300                 ti->error = "Discard support cannot be disabled once enabled";
3301                 r = -EINVAL;
3302                 goto out_flags_changed;
3303         }
3304
3305         pt->pool = pool;
3306         pt->ti = ti;
3307         pt->metadata_dev = metadata_dev;
3308         pt->data_dev = data_dev;
3309         pt->low_water_blocks = low_water_blocks;
3310         pt->adjusted_pf = pt->requested_pf = pf;
3311         ti->num_flush_bios = 1;
3312
3313         /*
3314          * Only need to enable discards if the pool should pass
3315          * them down to the data device.  The thin device's discard
3316          * processing will cause mappings to be removed from the btree.
3317          */
3318         if (pf.discard_enabled && pf.discard_passdown) {
3319                 ti->num_discard_bios = 1;
3320
3321                 /*
3322                  * Setting 'discards_supported' circumvents the normal
3323                  * stacking of discard limits (this keeps the pool and
3324                  * thin devices' discard limits consistent).
3325                  */
3326                 ti->discards_supported = true;
3327         }
3328         ti->private = pt;
3329
3330         r = dm_pool_register_metadata_threshold(pt->pool->pmd,
3331                                                 calc_metadata_threshold(pt),
3332                                                 metadata_low_callback,
3333                                                 pool);
3334         if (r)
3335                 goto out_flags_changed;
3336
3337         pt->callbacks.congested_fn = pool_is_congested;
3338         dm_table_add_target_callbacks(ti->table, &pt->callbacks);
3339
3340         mutex_unlock(&dm_thin_pool_table.mutex);
3341
3342         return 0;
3343
3344 out_flags_changed:
3345         __pool_dec(pool);
3346 out_free_pt:
3347         kfree(pt);
3348 out:
3349         dm_put_device(ti, data_dev);
3350 out_metadata:
3351         dm_put_device(ti, metadata_dev);
3352 out_unlock:
3353         mutex_unlock(&dm_thin_pool_table.mutex);
3354
3355         return r;
3356 }
3357
3358 static int pool_map(struct dm_target *ti, struct bio *bio)
3359 {
3360         int r;
3361         struct pool_c *pt = ti->private;
3362         struct pool *pool = pt->pool;
3363         unsigned long flags;
3364
3365         /*
3366          * As this is a singleton target, ti->begin is always zero.
3367          */
3368         spin_lock_irqsave(&pool->lock, flags);
3369         bio_set_dev(bio, pt->data_dev->bdev);
3370         r = DM_MAPIO_REMAPPED;
3371         spin_unlock_irqrestore(&pool->lock, flags);
3372
3373         return r;
3374 }
3375
3376 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
3377 {
3378         int r;
3379         struct pool_c *pt = ti->private;
3380         struct pool *pool = pt->pool;
3381         sector_t data_size = ti->len;
3382         dm_block_t sb_data_size;
3383
3384         *need_commit = false;
3385
3386         (void) sector_div(data_size, pool->sectors_per_block);
3387
3388         r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
3389         if (r) {
3390                 DMERR("%s: failed to retrieve data device size",
3391                       dm_device_name(pool->pool_md));
3392                 return r;
3393         }
3394
3395         if (data_size < sb_data_size) {
3396                 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
3397                       dm_device_name(pool->pool_md),
3398                       (unsigned long long)data_size, sb_data_size);
3399                 return -EINVAL;
3400
3401         } else if (data_size > sb_data_size) {
3402                 if (dm_pool_metadata_needs_check(pool->pmd)) {
3403                         DMERR("%s: unable to grow the data device until repaired.",
3404                               dm_device_name(pool->pool_md));
3405                         return 0;
3406                 }
3407
3408                 if (sb_data_size)
3409                         DMINFO("%s: growing the data device from %llu to %llu blocks",
3410                                dm_device_name(pool->pool_md),
3411                                sb_data_size, (unsigned long long)data_size);
3412                 r = dm_pool_resize_data_dev(pool->pmd, data_size);
3413                 if (r) {
3414                         metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
3415                         return r;
3416                 }
3417
3418                 *need_commit = true;
3419         }
3420
3421         return 0;
3422 }
3423
3424 static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
3425 {
3426         int r;
3427         struct pool_c *pt = ti->private;
3428         struct pool *pool = pt->pool;
3429         dm_block_t metadata_dev_size, sb_metadata_dev_size;
3430
3431         *need_commit = false;
3432
3433         metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
3434
3435         r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
3436         if (r) {
3437                 DMERR("%s: failed to retrieve metadata device size",
3438                       dm_device_name(pool->pool_md));
3439                 return r;
3440         }
3441
3442         if (metadata_dev_size < sb_metadata_dev_size) {
3443                 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
3444                       dm_device_name(pool->pool_md),
3445                       metadata_dev_size, sb_metadata_dev_size);
3446                 return -EINVAL;
3447
3448         } else if (metadata_dev_size > sb_metadata_dev_size) {
3449                 if (dm_pool_metadata_needs_check(pool->pmd)) {
3450                         DMERR("%s: unable to grow the metadata device until repaired.",
3451                               dm_device_name(pool->pool_md));
3452                         return 0;
3453                 }
3454
3455                 warn_if_metadata_device_too_big(pool->md_dev);
3456                 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
3457                        dm_device_name(pool->pool_md),
3458                        sb_metadata_dev_size, metadata_dev_size);
3459
3460                 if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
3461                         set_pool_mode(pool, PM_WRITE);
3462
3463                 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
3464                 if (r) {
3465                         metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
3466                         return r;
3467                 }
3468
3469                 *need_commit = true;
3470         }
3471
3472         return 0;
3473 }
3474
3475 /*
3476  * Retrieves the number of blocks of the data device from
3477  * the superblock and compares it to the actual device size,
3478  * growing the recorded size if the device has grown.
3479  *
3480  * This copes both with the ctr opening a preallocated data device
3481  * that is then immediately resumed
3482  * -and-
3483  * with the resume method being called on its own after userspace
3484  * has grown the data device in reaction to a table event.
3485  */
3486 static int pool_preresume(struct dm_target *ti)
3487 {
3488         int r;
3489         bool need_commit1, need_commit2;
3490         struct pool_c *pt = ti->private;
3491         struct pool *pool = pt->pool;
3492
3493         /*
3494          * Take control of the pool object.
3495          */
3496         r = bind_control_target(pool, ti);
3497         if (r)
3498                 return r;
3499
3500         r = maybe_resize_data_dev(ti, &need_commit1);
3501         if (r)
3502                 return r;
3503
3504         r = maybe_resize_metadata_dev(ti, &need_commit2);
3505         if (r)
3506                 return r;
3507
3508         if (need_commit1 || need_commit2)
3509                 (void) commit(pool);
3510
3511         return 0;
3512 }
3513
3514 static void pool_suspend_active_thins(struct pool *pool)
3515 {
3516         struct thin_c *tc;
3517
3518         /* Suspend all active thin devices */
3519         tc = get_first_thin(pool);
3520         while (tc) {
3521                 dm_internal_suspend_noflush(tc->thin_md);
3522                 tc = get_next_thin(pool, tc);
3523         }
3524 }
3525
3526 static void pool_resume_active_thins(struct pool *pool)
3527 {
3528         struct thin_c *tc;
3529
3530         /* Resume all active thin devices */
3531         tc = get_first_thin(pool);
3532         while (tc) {
3533                 dm_internal_resume(tc->thin_md);
3534                 tc = get_next_thin(pool, tc);
3535         }
3536 }
3537
3538 static void pool_resume(struct dm_target *ti)
3539 {
3540         struct pool_c *pt = ti->private;
3541         struct pool *pool = pt->pool;
3542         unsigned long flags;
3543
3544         /*
3545          * Must requeue active_thins' bios and then resume
3546          * active_thins _before_ clearing 'suspend' flag.
3547          */
3548         requeue_bios(pool);
3549         pool_resume_active_thins(pool);
3550
3551         spin_lock_irqsave(&pool->lock, flags);
3552         pool->low_water_triggered = false;
3553         pool->suspended = false;
3554         spin_unlock_irqrestore(&pool->lock, flags);
3555
3556         do_waker(&pool->waker.work);
3557 }
3558
3559 static void pool_presuspend(struct dm_target *ti)
3560 {
3561         struct pool_c *pt = ti->private;
3562         struct pool *pool = pt->pool;
3563         unsigned long flags;
3564
3565         spin_lock_irqsave(&pool->lock, flags);
3566         pool->suspended = true;
3567         spin_unlock_irqrestore(&pool->lock, flags);
3568
3569         pool_suspend_active_thins(pool);
3570 }
3571
3572 static void pool_presuspend_undo(struct dm_target *ti)
3573 {
3574         struct pool_c *pt = ti->private;
3575         struct pool *pool = pt->pool;
3576         unsigned long flags;
3577
3578         pool_resume_active_thins(pool);
3579
3580         spin_lock_irqsave(&pool->lock, flags);
3581         pool->suspended = false;
3582         spin_unlock_irqrestore(&pool->lock, flags);
3583 }
3584
3585 static void pool_postsuspend(struct dm_target *ti)
3586 {
3587         struct pool_c *pt = ti->private;
3588         struct pool *pool = pt->pool;
3589
3590         cancel_delayed_work_sync(&pool->waker);
3591         cancel_delayed_work_sync(&pool->no_space_timeout);
3592         flush_workqueue(pool->wq);
3593         (void) commit(pool);
3594 }
3595
3596 static int check_arg_count(unsigned argc, unsigned args_required)
3597 {
3598         if (argc != args_required) {
3599                 DMWARN("Message received with %u arguments instead of %u.",
3600                        argc, args_required);
3601                 return -EINVAL;
3602         }
3603
3604         return 0;
3605 }
3606
3607 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
3608 {
3609         if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
3610             *dev_id <= MAX_DEV_ID)
3611                 return 0;
3612
3613         if (warning)
3614                 DMWARN("Message received with invalid device id: %s", arg);
3615
3616         return -EINVAL;
3617 }
3618
3619 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
3620 {
3621         dm_thin_id dev_id;
3622         int r;
3623
3624         r = check_arg_count(argc, 2);
3625         if (r)
3626                 return r;
3627
3628         r = read_dev_id(argv[1], &dev_id, 1);
3629         if (r)
3630                 return r;
3631
3632         r = dm_pool_create_thin(pool->pmd, dev_id);
3633         if (r) {
3634                 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
3635                        argv[1]);
3636                 return r;
3637         }
3638
3639         return 0;
3640 }
3641
3642 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3643 {
3644         dm_thin_id dev_id;
3645         dm_thin_id origin_dev_id;
3646         int r;
3647
3648         r = check_arg_count(argc, 3);
3649         if (r)
3650                 return r;
3651
3652         r = read_dev_id(argv[1], &dev_id, 1);
3653         if (r)
3654                 return r;
3655
3656         r = read_dev_id(argv[2], &origin_dev_id, 1);
3657         if (r)
3658                 return r;
3659
3660         r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
3661         if (r) {
3662                 DMWARN("Creation of new snapshot %s of device %s failed.",
3663                        argv[1], argv[2]);
3664                 return r;
3665         }
3666
3667         return 0;
3668 }
3669
3670 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
3671 {
3672         dm_thin_id dev_id;
3673         int r;
3674
3675         r = check_arg_count(argc, 2);
3676         if (r)
3677                 return r;
3678
3679         r = read_dev_id(argv[1], &dev_id, 1);
3680         if (r)
3681                 return r;
3682
3683         r = dm_pool_delete_thin_device(pool->pmd, dev_id);
3684         if (r)
3685                 DMWARN("Deletion of thin device %s failed.", argv[1]);
3686
3687         return r;
3688 }
3689
3690 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
3691 {
3692         dm_thin_id old_id, new_id;
3693         int r;
3694
3695         r = check_arg_count(argc, 3);
3696         if (r)
3697                 return r;
3698
3699         if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
3700                 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
3701                 return -EINVAL;
3702         }
3703
3704         if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
3705                 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
3706                 return -EINVAL;
3707         }
3708
3709         r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
3710         if (r) {
3711                 DMWARN("Failed to change transaction id from %s to %s.",
3712                        argv[1], argv[2]);
3713                 return r;
3714         }
3715
3716         return 0;
3717 }
3718
3719 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3720 {
3721         int r;
3722
3723         r = check_arg_count(argc, 1);
3724         if (r)
3725                 return r;
3726
3727         (void) commit(pool);
3728
3729         r = dm_pool_reserve_metadata_snap(pool->pmd);
3730         if (r)
3731                 DMWARN("reserve_metadata_snap message failed.");
3732
3733         return r;
3734 }
3735
3736 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3737 {
3738         int r;
3739
3740         r = check_arg_count(argc, 1);
3741         if (r)
3742                 return r;
3743
3744         r = dm_pool_release_metadata_snap(pool->pmd);
3745         if (r)
3746                 DMWARN("release_metadata_snap message failed.");
3747
3748         return r;
3749 }
3750
3751 /*
3752  * Messages supported:
3753  *   create_thin        <dev_id>
3754  *   create_snap        <dev_id> <origin_id>
3755  *   delete             <dev_id>
3756  *   set_transaction_id <current_trans_id> <new_trans_id>
3757  *   reserve_metadata_snap
3758  *   release_metadata_snap
3759  */
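/*
 * For example (device name and ids invented), from userspace:
 *
 *   dmsetup message my_pool 0 create_thin 0
 *   dmsetup message my_pool 0 create_snap 1 0
 *   dmsetup message my_pool 0 delete 1
 *
 * A sketch of the same create_thin message sent through libdevmapper
 * (assuming the usual dm_task API; example only, not compiled):
 */
#if 0
#include <libdevmapper.h>

static int create_thin_example(void)
{
	struct dm_task *dmt = dm_task_create(DM_DEVICE_TARGET_MSG);
	int r = 0;

	if (!dmt)
		return -1;
	/* Address the message to sector 0 of the pool target. */
	if (!dm_task_set_name(dmt, "my_pool") ||
	    !dm_task_set_sector(dmt, 0) ||
	    !dm_task_set_message(dmt, "create_thin 0") ||
	    !dm_task_run(dmt))
		r = -1;
	dm_task_destroy(dmt);
	return r;
}
#endif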
3760 static int pool_message(struct dm_target *ti, unsigned argc, char **argv,
3761                         char *result, unsigned maxlen)
3762 {
3763         int r = -EINVAL;
3764         struct pool_c *pt = ti->private;
3765         struct pool *pool = pt->pool;
3766
3767         if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
3768                 DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
3769                       dm_device_name(pool->pool_md));
3770                 return -EOPNOTSUPP;
3771         }
3772
3773         if (!strcasecmp(argv[0], "create_thin"))
3774                 r = process_create_thin_mesg(argc, argv, pool);
3775
3776         else if (!strcasecmp(argv[0], "create_snap"))
3777                 r = process_create_snap_mesg(argc, argv, pool);
3778
3779         else if (!strcasecmp(argv[0], "delete"))
3780                 r = process_delete_mesg(argc, argv, pool);
3781
3782         else if (!strcasecmp(argv[0], "set_transaction_id"))
3783                 r = process_set_transaction_id_mesg(argc, argv, pool);
3784
3785         else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
3786                 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
3787
3788         else if (!strcasecmp(argv[0], "release_metadata_snap"))
3789                 r = process_release_metadata_snap_mesg(argc, argv, pool);
3790
3791         else
3792                 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
3793
3794         if (!r)
3795                 (void) commit(pool);
3796
3797         return r;
3798 }
3799
3800 static void emit_flags(struct pool_features *pf, char *result,
3801                        unsigned sz, unsigned maxlen)
3802 {
3803         unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
3804                 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
3805                 pf->error_if_no_space;
3806         DMEMIT("%u ", count);
3807
3808         if (!pf->zero_new_blocks)
3809                 DMEMIT("skip_block_zeroing ");
3810
3811         if (!pf->discard_enabled)
3812                 DMEMIT("ignore_discard ");
3813
3814         if (!pf->discard_passdown)
3815                 DMEMIT("no_discard_passdown ");
3816
3817         if (pf->mode == PM_READ_ONLY)
3818                 DMEMIT("read_only ");
3819
3820         if (pf->error_if_no_space)
3821                 DMEMIT("error_if_no_space ");
3822 }
3823
3824 /*
3825  * Status line is:
3826  *    <transaction id> <used metadata blocks>/<total metadata blocks>
3827  *    <used data blocks>/<total data blocks> <held metadata root>
3828  *    <pool mode> <discard config> <no space config> <needs_check> <metadata low watermark>
3829  */
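/*
 * An illustrative INFO line (values invented):
 *
 *   0 141/32768 1028/204800 - rw discard_passdown queue_if_no_space - 1024
 *
 * ie. transaction 0, 141 of 32768 metadata blocks and 1028 of 204800
 * data blocks used, no held metadata root, read-write mode, discards
 * passed down, IO queued when out of data space, no needs_check, and a
 * 1024-block metadata low watermark.
 */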
3830 static void pool_status(struct dm_target *ti, status_type_t type,
3831                         unsigned status_flags, char *result, unsigned maxlen)
3832 {
3833         int r;
3834         unsigned sz = 0;
3835         uint64_t transaction_id;
3836         dm_block_t nr_free_blocks_data;
3837         dm_block_t nr_free_blocks_metadata;
3838         dm_block_t nr_blocks_data;
3839         dm_block_t nr_blocks_metadata;
3840         dm_block_t held_root;
3841         enum pool_mode mode;
3842         char buf[BDEVNAME_SIZE];
3843         char buf2[BDEVNAME_SIZE];
3844         struct pool_c *pt = ti->private;
3845         struct pool *pool = pt->pool;
3846
3847         switch (type) {
3848         case STATUSTYPE_INFO:
3849                 if (get_pool_mode(pool) == PM_FAIL) {
3850                         DMEMIT("Fail");
3851                         break;
3852                 }
3853
3854                 /* Commit to ensure statistics aren't out-of-date */
3855                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3856                         (void) commit(pool);
3857
3858                 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
3859                 if (r) {
3860                         DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
3861                               dm_device_name(pool->pool_md), r);
3862                         goto err;
3863                 }
3864
3865                 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
3866                 if (r) {
3867                         DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
3868                               dm_device_name(pool->pool_md), r);
3869                         goto err;
3870                 }
3871
3872                 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
3873                 if (r) {
3874                         DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
3875                               dm_device_name(pool->pool_md), r);
3876                         goto err;
3877                 }
3878
3879                 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
3880                 if (r) {
3881                         DMERR("%s: dm_pool_get_free_block_count returned %d",
3882                               dm_device_name(pool->pool_md), r);
3883                         goto err;
3884                 }
3885
3886                 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
3887                 if (r) {
3888                         DMERR("%s: dm_pool_get_data_dev_size returned %d",
3889                               dm_device_name(pool->pool_md), r);
3890                         goto err;
3891                 }
3892
3893                 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
3894                 if (r) {
3895                         DMERR("%s: dm_pool_get_metadata_snap returned %d",
3896                               dm_device_name(pool->pool_md), r);
3897                         goto err;
3898                 }
3899
3900                 DMEMIT("%llu %llu/%llu %llu/%llu ",
3901                        (unsigned long long)transaction_id,
3902                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3903                        (unsigned long long)nr_blocks_metadata,
3904                        (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
3905                        (unsigned long long)nr_blocks_data);
3906
3907                 if (held_root)
3908                         DMEMIT("%llu ", held_root);
3909                 else
3910                         DMEMIT("- ");
3911
3912                 mode = get_pool_mode(pool);
3913                 if (mode == PM_OUT_OF_DATA_SPACE)
3914                         DMEMIT("out_of_data_space ");
3915                 else if (is_read_only_pool_mode(mode))
3916                         DMEMIT("ro ");
3917                 else
3918                         DMEMIT("rw ");
3919
3920                 if (!pool->pf.discard_enabled)
3921                         DMEMIT("ignore_discard ");
3922                 else if (pool->pf.discard_passdown)
3923                         DMEMIT("discard_passdown ");
3924                 else
3925                         DMEMIT("no_discard_passdown ");
3926
3927                 if (pool->pf.error_if_no_space)
3928                         DMEMIT("error_if_no_space ");
3929                 else
3930                         DMEMIT("queue_if_no_space ");
3931
3932                 if (dm_pool_metadata_needs_check(pool->pmd))
3933                         DMEMIT("needs_check ");
3934                 else
3935                         DMEMIT("- ");
3936
3937                 DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt));
3938
3939                 break;
3940
3941         case STATUSTYPE_TABLE:
3942                 DMEMIT("%s %s %lu %llu ",
3943                        format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
3944                        format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
3945                        (unsigned long)pool->sectors_per_block,
3946                        (unsigned long long)pt->low_water_blocks);
3947                 emit_flags(&pt->requested_pf, result, sz, maxlen);
3948                 break;
3949         }
3950         return;
3951
3952 err:
3953         DMEMIT("Error");
3954 }
3955
3956 static int pool_iterate_devices(struct dm_target *ti,
3957                                 iterate_devices_callout_fn fn, void *data)
3958 {
3959         struct pool_c *pt = ti->private;
3960
3961         return fn(ti, pt->data_dev, 0, ti->len, data);
3962 }
3963
3964 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3965 {
3966         struct pool_c *pt = ti->private;
3967         struct pool *pool = pt->pool;
3968         sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3969
3970         /*
3971          * If max_sectors is smaller than pool->sectors_per_block adjust it
3972          * to the highest possible power-of-2 factor of pool->sectors_per_block.
3973          * This is especially beneficial when the pool's data device is a RAID
3974          * device whose full stripe width matches pool->sectors_per_block --
3975          * because even though partial RAID stripe-sized IOs will be issued to a
3976          * single RAID stripe, when aggregated they will end on a full RAID
3977          * stripe boundary, which avoids cascading further partial RAID stripe writes.
3978          */
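	/*
	 * Worked example (numbers invented): sectors_per_block == 384
	 * (192KiB) and a stacked max_sectors of 256.  256 is not a factor
	 * of 384, and it is a power of two, so it is first decremented to
	 * 255 and then rounded down to 128; 384 % 128 == 0, so the loop
	 * below settles on max_sectors == 128.
	 */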
3979         if (limits->max_sectors < pool->sectors_per_block) {
3980                 while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
3981                         if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
3982                                 limits->max_sectors--;
3983                         limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
3984                 }
3985         }
3986
3987         /*
3988          * If the system-determined stacked limits are compatible with the
3989          * pool's blocksize (io_opt is a factor) do not override them.
3990          */
3991         if (io_opt_sectors < pool->sectors_per_block ||
3992             !is_factor(io_opt_sectors, pool->sectors_per_block)) {
3993                 if (is_factor(pool->sectors_per_block, limits->max_sectors))
3994                         blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
3995                 else
3996                         blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
3997                 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3998         }
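
	/*
	 * e.g. (illustrative numbers): with 128-sector (64KiB) blocks and a
	 * stacked io_opt of 4KiB, 8 sectors is smaller than 128, so io_opt
	 * is overridden to the 64KiB block size; io_min becomes max_sectors
	 * when that is a factor of the block size, otherwise the full block.
	 */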
3999
4000         /*
4001          * pt->adjusted_pf is a staging area for the actual features to use.
4002          * They get transferred to the live pool in bind_control_target()
4003          * called from pool_preresume().
4004          */
4005         if (!pt->adjusted_pf.discard_enabled) {
4006                 /*
4007                  * Must explicitly disallow stacking discard limits otherwise the
4008                  * block layer will stack them if pool's data device has support.
4009                  * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
4010                  * user to see that, so make sure to set all discard limits to 0.
4011                  */
4012                 limits->discard_granularity = 0;
4013                 return;
4014         }
4015
4016         disable_passdown_if_not_supported(pt);
4017
4018         /*
4019          * The pool uses the same discard limits as the underlying data
4020          * device.  DM core has already set this up.
4021          */
4022 }
4023
4024 static struct target_type pool_target = {
4025         .name = "thin-pool",
4026         .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
4027                     DM_TARGET_IMMUTABLE,
4028         .version = {1, 21, 0},
4029         .module = THIS_MODULE,
4030         .ctr = pool_ctr,
4031         .dtr = pool_dtr,
4032         .map = pool_map,
4033         .presuspend = pool_presuspend,
4034         .presuspend_undo = pool_presuspend_undo,
4035         .postsuspend = pool_postsuspend,
4036         .preresume = pool_preresume,
4037         .resume = pool_resume,
4038         .message = pool_message,
4039         .status = pool_status,
4040         .iterate_devices = pool_iterate_devices,
4041         .io_hints = pool_io_hints,
4042 };
4043
4044 /*----------------------------------------------------------------
4045  * Thin target methods
4046  *--------------------------------------------------------------*/
4047 static void thin_get(struct thin_c *tc)
4048 {
4049         refcount_inc(&tc->refcount);
4050 }
4051
4052 static void thin_put(struct thin_c *tc)
4053 {
4054         if (refcount_dec_and_test(&tc->refcount))
4055                 complete(&tc->can_destroy);
4056 }
4057
4058 static void thin_dtr(struct dm_target *ti)
4059 {
4060         struct thin_c *tc = ti->private;
4061         unsigned long flags;
4062
4063         spin_lock_irqsave(&tc->pool->lock, flags);
4064         list_del_rcu(&tc->list);
4065         spin_unlock_irqrestore(&tc->pool->lock, flags);
4066         synchronize_rcu();
4067
4068         thin_put(tc);
4069         wait_for_completion(&tc->can_destroy);
4070
4071         mutex_lock(&dm_thin_pool_table.mutex);
4072
4073         __pool_dec(tc->pool);
4074         dm_pool_close_thin_device(tc->td);
4075         dm_put_device(ti, tc->pool_dev);
4076         if (tc->origin_dev)
4077                 dm_put_device(ti, tc->origin_dev);
4078         kfree(tc);
4079
4080         mutex_unlock(&dm_thin_pool_table.mutex);
4081 }
4082
4083 /*
4084  * Thin target parameters:
4085  *
4086  * <pool_dev> <dev_id> [origin_dev]
4087  *
4088  * pool_dev: the path to the pool (e.g. /dev/mapper/my_pool)
4089  * dev_id: the internal device identifier
4090  * origin_dev: a device external to the pool that should act as the origin
4091  *
4092  * If the pool device has discards disabled, they get disabled for the thin
4093  * device as well.
4094  */
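/*
 * A hypothetical example table line (names invented) for a 1GiB thin
 * volume with device id 0 inside the pool "pool":
 *
 *   0 2097152 thin /dev/mapper/pool 0
 *
 * An external origin, when used, is appended as the third argument.
 */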
4095 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
4096 {
4097         int r;
4098         struct thin_c *tc;
4099         struct dm_dev *pool_dev, *origin_dev;
4100         struct mapped_device *pool_md;
4101         unsigned long flags;
4102
4103         mutex_lock(&dm_thin_pool_table.mutex);
4104
4105         if (argc != 2 && argc != 3) {
4106                 ti->error = "Invalid argument count";
4107                 r = -EINVAL;
4108                 goto out_unlock;
4109         }
4110
4111         tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
4112         if (!tc) {
4113                 ti->error = "Out of memory";
4114                 r = -ENOMEM;
4115                 goto out_unlock;
4116         }
4117         tc->thin_md = dm_table_get_md(ti->table);
4118         spin_lock_init(&tc->lock);
4119         INIT_LIST_HEAD(&tc->deferred_cells);
4120         bio_list_init(&tc->deferred_bio_list);
4121         bio_list_init(&tc->retry_on_resume_list);
4122         tc->sort_bio_list = RB_ROOT;
4123
4124         if (argc == 3) {
4125                 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
4126                 if (r) {
4127                         ti->error = "Error opening origin device";
4128                         goto bad_origin_dev;
4129                 }
4130                 tc->origin_dev = origin_dev;
4131         }
4132
4133         r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
4134         if (r) {
4135                 ti->error = "Error opening pool device";
4136                 goto bad_pool_dev;
4137         }
4138         tc->pool_dev = pool_dev;
4139
4140         if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
4141                 ti->error = "Invalid device id";
4142                 r = -EINVAL;
4143                 goto bad_common;
4144         }
4145
4146         pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
4147         if (!pool_md) {
4148                 ti->error = "Couldn't get pool mapped device";
4149                 r = -EINVAL;
4150                 goto bad_common;
4151         }
4152
4153         tc->pool = __pool_table_lookup(pool_md);
4154         if (!tc->pool) {
4155                 ti->error = "Couldn't find pool object";
4156                 r = -EINVAL;
4157                 goto bad_pool_lookup;
4158         }
4159         __pool_inc(tc->pool);
4160
4161         if (get_pool_mode(tc->pool) == PM_FAIL) {
4162                 ti->error = "Couldn't open thin device: pool is in fail mode";
4163                 r = -EINVAL;
4164                 goto bad_pool;
4165         }
4166
4167         r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
4168         if (r) {
4169                 ti->error = "Couldn't open thin internal device";
4170                 goto bad_pool;
4171         }
4172
4173         r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
4174         if (r)
4175                 goto bad;
4176
4177         ti->num_flush_bios = 1;
4178         ti->flush_supported = true;
4179         ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
4180
4181         /* In case the pool supports discards, pass them on. */
4182         if (tc->pool->pf.discard_enabled) {
4183                 ti->discards_supported = true;
4184                 ti->num_discard_bios = 1;
4185                 ti->split_discard_bios = false;
4186         }
4187
4188         mutex_unlock(&dm_thin_pool_table.mutex);
4189
4190         spin_lock_irqsave(&tc->pool->lock, flags);
4191         if (tc->pool->suspended) {
4192                 spin_unlock_irqrestore(&tc->pool->lock, flags);
4193                 mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
4194                 ti->error = "Unable to activate thin device while pool is suspended";
4195                 r = -EINVAL;
4196                 goto bad;
4197         }
4198         refcount_set(&tc->refcount, 1);
4199         init_completion(&tc->can_destroy);
4200         list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
4201         spin_unlock_irqrestore(&tc->pool->lock, flags);
4202         /*
4203          * This synchronize_rcu() call is needed here otherwise we risk a
4204          * wake_worker() call finding no bios to process (because the newly
4205          * added tc isn't yet visible).  So this reduces latency since we
4206          * aren't then dependent on the periodic commit to wake_worker().
4207          */
4208         synchronize_rcu();
4209
4210         dm_put(pool_md);
4211
4212         return 0;
4213
4214 bad:
4215         dm_pool_close_thin_device(tc->td);
4216 bad_pool:
4217         __pool_dec(tc->pool);
4218 bad_pool_lookup:
4219         dm_put(pool_md);
4220 bad_common:
4221         dm_put_device(ti, tc->pool_dev);
4222 bad_pool_dev:
4223         if (tc->origin_dev)
4224                 dm_put_device(ti, tc->origin_dev);
4225 bad_origin_dev:
4226         kfree(tc);
4227 out_unlock:
4228         mutex_unlock(&dm_thin_pool_table.mutex);
4229
4230         return r;
4231 }
4232
4233 static int thin_map(struct dm_target *ti, struct bio *bio)
4234 {
4235         bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
4236
4237         return thin_bio_map(ti, bio);
4238 }
4239
4240 static int thin_endio(struct dm_target *ti, struct bio *bio,
4241                 blk_status_t *err)
4242 {
4243         unsigned long flags;
4244         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
4245         struct list_head work;
4246         struct dm_thin_new_mapping *m, *tmp;
4247         struct pool *pool = h->tc->pool;
4248
4249         if (h->shared_read_entry) {
4250                 INIT_LIST_HEAD(&work);
4251                 dm_deferred_entry_dec(h->shared_read_entry, &work);
4252
4253                 spin_lock_irqsave(&pool->lock, flags);
4254                 list_for_each_entry_safe(m, tmp, &work, list) {
4255                         list_del(&m->list);
4256                         __complete_mapping_preparation(m);
4257                 }
4258                 spin_unlock_irqrestore(&pool->lock, flags);
4259         }
4260
4261         if (h->all_io_entry) {
4262                 INIT_LIST_HEAD(&work);
4263                 dm_deferred_entry_dec(h->all_io_entry, &work);
4264                 if (!list_empty(&work)) {
4265                         spin_lock_irqsave(&pool->lock, flags);
4266                         list_for_each_entry_safe(m, tmp, &work, list)
4267                                 list_add_tail(&m->list, &pool->prepared_discards);
4268                         spin_unlock_irqrestore(&pool->lock, flags);
4269                         wake_worker(pool);
4270                 }
4271         }
4272
4273         if (h->cell)
4274                 cell_defer_no_holder(h->tc, h->cell);
4275
4276         return DM_ENDIO_DONE;
4277 }
4278
4279 static void thin_presuspend(struct dm_target *ti)
4280 {
4281         struct thin_c *tc = ti->private;
4282
4283         if (dm_noflush_suspending(ti))
4284                 noflush_work(tc, do_noflush_start);
4285 }
4286
4287 static void thin_postsuspend(struct dm_target *ti)
4288 {
4289         struct thin_c *tc = ti->private;
4290
4291         /*
4292          * The dm_noflush_suspending flag has been cleared by now, so
4293          * unfortunately we must always run this.
4294          */
4295         noflush_work(tc, do_noflush_stop);
4296 }
4297
4298 static int thin_preresume(struct dm_target *ti)
4299 {
4300         struct thin_c *tc = ti->private;
4301
4302         if (tc->origin_dev)
4303                 tc->origin_size = get_dev_size(tc->origin_dev->bdev);
4304
4305         return 0;
4306 }
4307
4308 /*
4309  * <nr mapped sectors> <highest mapped sector>
4310  */
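/*
 * e.g. (invented values) with 128-sector blocks, 8192 mapped blocks and
 * a highest mapped block of 16383, the INFO line reads:
 *
 *   1048576 2097151
 */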
4311 static void thin_status(struct dm_target *ti, status_type_t type,
4312                         unsigned status_flags, char *result, unsigned maxlen)
4313 {
4314         int r;
4315         ssize_t sz = 0;
4316         dm_block_t mapped, highest;
4317         char buf[BDEVNAME_SIZE];
4318         struct thin_c *tc = ti->private;
4319
4320         if (get_pool_mode(tc->pool) == PM_FAIL) {
4321                 DMEMIT("Fail");
4322                 return;
4323         }
4324
4325         if (!tc->td)
4326                 DMEMIT("-");
4327         else {
4328                 switch (type) {
4329                 case STATUSTYPE_INFO:
4330                         r = dm_thin_get_mapped_count(tc->td, &mapped);
4331                         if (r) {
4332                                 DMERR("dm_thin_get_mapped_count returned %d", r);
4333                                 goto err;
4334                         }
4335
4336                         r = dm_thin_get_highest_mapped_block(tc->td, &highest);
4337                         if (r < 0) {
4338                                 DMERR("dm_thin_get_highest_mapped_block returned %d", r);
4339                                 goto err;
4340                         }
4341
4342                         DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
4343                         if (r)
4344                                 DMEMIT("%llu", ((highest + 1) *
4345                                                 tc->pool->sectors_per_block) - 1);
4346                         else
4347                                 DMEMIT("-");
4348                         break;
4349
4350                 case STATUSTYPE_TABLE:
4351                         DMEMIT("%s %lu",
4352                                format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
4353                                (unsigned long) tc->dev_id);
4354                         if (tc->origin_dev)
4355                                 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
4356                         break;
4357                 }
4358         }
4359
4360         return;
4361
4362 err:
4363         DMEMIT("Error");
4364 }
4365
4366 static int thin_iterate_devices(struct dm_target *ti,
4367                                 iterate_devices_callout_fn fn, void *data)
4368 {
4369         sector_t blocks;
4370         struct thin_c *tc = ti->private;
4371         struct pool *pool = tc->pool;
4372
4373         /*
4374          * We can't call dm_pool_get_data_dev_size() since that blocks.  So
4375          * we follow a more convoluted path through to the pool's target.
4376          */
4377         if (!pool->ti)
4378                 return 0;       /* nothing is bound */
4379
4380         blocks = pool->ti->len;
4381         (void) sector_div(blocks, pool->sectors_per_block);
4382         if (blocks)
4383                 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
4384
4385         return 0;
4386 }
4387
4388 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
4389 {
4390         struct thin_c *tc = ti->private;
4391         struct pool *pool = tc->pool;
4392
4393         if (!pool->pf.discard_enabled)
4394                 return;
4395
4396         limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
4397         limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
4398 }
4399
4400 static struct target_type thin_target = {
4401         .name = "thin",
4402         .version = {1, 21, 0},
4403         .module = THIS_MODULE,
4404         .ctr = thin_ctr,
4405         .dtr = thin_dtr,
4406         .map = thin_map,
4407         .end_io = thin_endio,
4408         .preresume = thin_preresume,
4409         .presuspend = thin_presuspend,
4410         .postsuspend = thin_postsuspend,
4411         .status = thin_status,
4412         .iterate_devices = thin_iterate_devices,
4413         .io_hints = thin_io_hints,
4414 };
4415
4416 /*----------------------------------------------------------------*/
4417
4418 static int __init dm_thin_init(void)
4419 {
4420         int r = -ENOMEM;
4421
4422         pool_table_init();
4423
4424         _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
4425         if (!_new_mapping_cache)
4426                 return r;
4427
4428         r = dm_register_target(&thin_target);
4429         if (r)
4430                 goto bad_new_mapping_cache;
4431
4432         r = dm_register_target(&pool_target);
4433         if (r)
4434                 goto bad_thin_target;
4435
4436         return 0;
4437
4438 bad_thin_target:
4439         dm_unregister_target(&thin_target);
4440 bad_new_mapping_cache:
4441         kmem_cache_destroy(_new_mapping_cache);
4442
4443         return r;
4444 }
4445
4446 static void dm_thin_exit(void)
4447 {
4448         dm_unregister_target(&thin_target);
4449         dm_unregister_target(&pool_target);
4450
4451         kmem_cache_destroy(_new_mapping_cache);
4452
4453         pool_table_exit();
4454 }
4455
4456 module_init(dm_thin_init);
4457 module_exit(dm_thin_exit);
4458
4459 module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
4460 MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
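
/*
 * The timeout can also be tuned at runtime, e.g. (sysfs path assuming
 * the module is built as dm-thin-pool):
 *
 *   echo 120 > /sys/module/dm_thin_pool/parameters/no_space_timeout
 */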
4461
4462 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
4463 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
4464 MODULE_LICENSE("GPL");