1 /*
2  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/blkdev.h>
20 #include <linux/ratelimit.h>
21 #include <linux/sched/mm.h>
22 #include "ctree.h"
23 #include "volumes.h"
24 #include "disk-io.h"
25 #include "ordered-data.h"
26 #include "transaction.h"
27 #include "backref.h"
28 #include "extent_io.h"
29 #include "dev-replace.h"
30 #include "check-integrity.h"
31 #include "rcu-string.h"
32 #include "raid56.h"
33
34 /*
35  * This is only the first step towards a full-featured scrub. It reads all
36  * extents and super blocks and verifies the checksums. In case a bad checksum
37  * is found or the extent cannot be read, good data will be written back if
38  * any can be found.
39  *
40  * Future enhancements:
41  *  - In case an unrepairable extent is encountered, track which files are
42  *    affected and report them
43  *  - track and record media errors, throw out bad devices
44  *  - add a mode to also read unallocated space
45  */
46
47 struct scrub_block;
48 struct scrub_ctx;
49
50 /*
51  * the following three values only influence the performance.
52  * The last one configures the number of parallel and outstanding I/O
53  * operations. The first two values configure an upper limit for the number
54  * of (dynamically allocated) pages that are added to a bio.
55  */
56 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
57 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
58 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
59
60 /*
61  * the following value times PAGE_SIZE needs to be large enough to match the
62  * largest node/leaf/sector size that shall be supported.
63  * Values larger than BTRFS_STRIPE_LEN are not supported.
64  */
65 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
66
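/*
 * Editorial note, not in the original file: assuming 4KiB pages, the limits
 * above work out as 32 * 4KiB = 128KiB per read/write bio, and 64 such bios
 * keep at most 64 * 128KiB = 8MiB in flight per device, matching the comments
 * on the defines. SCRUB_MAX_PAGES_PER_BLOCK = 16 covers a 64KiB
 * node/leaf/sector (16 * 4KiB), which stays within BTRFS_STRIPE_LEN.
 */
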
67 struct scrub_recover {
68         refcount_t              refs;
69         struct btrfs_bio        *bbio;
70         u64                     map_length;
71 };
72
73 struct scrub_page {
74         struct scrub_block      *sblock;
75         struct page             *page;
76         struct btrfs_device     *dev;
77         struct list_head        list;
78         u64                     flags;  /* extent flags */
79         u64                     generation;
80         u64                     logical;
81         u64                     physical;
82         u64                     physical_for_dev_replace;
83         atomic_t                refs;
84         struct {
85                 unsigned int    mirror_num:8;
86                 unsigned int    have_csum:1;
87                 unsigned int    io_error:1;
88         };
89         u8                      csum[BTRFS_CSUM_SIZE];
90
91         struct scrub_recover    *recover;
92 };
93
94 struct scrub_bio {
95         int                     index;
96         struct scrub_ctx        *sctx;
97         struct btrfs_device     *dev;
98         struct bio              *bio;
99         blk_status_t            status;
100         u64                     logical;
101         u64                     physical;
102 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
103         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
104 #else
105         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
106 #endif
107         int                     page_count;
108         int                     next_free;
109         struct btrfs_work       work;
110 };
111
112 struct scrub_block {
113         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
114         int                     page_count;
115         atomic_t                outstanding_pages;
116         refcount_t              refs; /* free mem on transition to zero */
117         struct scrub_ctx        *sctx;
118         struct scrub_parity     *sparity;
119         struct {
120                 unsigned int    header_error:1;
121                 unsigned int    checksum_error:1;
122                 unsigned int    no_io_error_seen:1;
123                 unsigned int    generation_error:1; /* also sets header_error */
124
125                 /* The following is for the data used to check parity, */
126                 /* i.e. for data that is covered by a checksum */
127                 unsigned int    data_corrected:1;
128         };
129         struct btrfs_work       work;
130 };
131
132 /* Used for the chunks with parity stripes such as RAID5/6 */
133 struct scrub_parity {
134         struct scrub_ctx        *sctx;
135
136         struct btrfs_device     *scrub_dev;
137
138         u64                     logic_start;
139
140         u64                     logic_end;
141
142         int                     nsectors;
143
144         u64                     stripe_len;
145
146         refcount_t              refs;
147
148         struct list_head        spages;
149
150         /* Work of parity check and repair */
151         struct btrfs_work       work;
152
153         /* Mark the parity blocks which have data */
154         unsigned long           *dbitmap;
155
156         /*
157          * Mark the parity blocks which have data, but for which errors
158          * happened when reading or checking that data
159          */
160         unsigned long           *ebitmap;
161
162         unsigned long           bitmap[0];
163 };
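
/*
 * Editorial sketch, not in the original file: how dbitmap/ebitmap and the
 * trailing flexible bitmap[] array are assumed to be tied together by the
 * allocation site elsewhere in this file (shown here only as an
 * illustration):
 *
 *	sparity = kzalloc(sizeof(*sparity) + 2 * bitmap_len, GFP_NOFS);
 *	sparity->dbitmap = sparity->bitmap;
 *	sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
 *
 * i.e. both per-sector bitmaps live inside bitmap[] and share the lifetime
 * of the scrub_parity, so only one refcounted object has to be freed.
 */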
164
165 struct scrub_ctx {
166         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
167         struct btrfs_fs_info    *fs_info;
168         int                     first_free;
169         int                     curr;
170         atomic_t                bios_in_flight;
171         atomic_t                workers_pending;
172         spinlock_t              list_lock;
173         wait_queue_head_t       list_wait;
174         u16                     csum_size;
175         struct list_head        csum_list;
176         atomic_t                cancel_req;
177         int                     readonly;
178         int                     pages_per_rd_bio;
179
180         int                     is_dev_replace;
181
182         struct scrub_bio        *wr_curr_bio;
183         struct mutex            wr_lock;
184         int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
185         struct btrfs_device     *wr_tgtdev;
186         bool                    flush_all_writes;
187
188         /*
189          * statistics
190          */
191         struct btrfs_scrub_progress stat;
192         spinlock_t              stat_lock;
193
194         /*
195          * Use a ref counter to avoid use-after-free issues. Scrub workers
196          * decrement bios_in_flight and workers_pending and then do a wakeup
197          * on the list_wait wait queue. We must ensure the main scrub task
198          * doesn't free the scrub context before or while the workers are
199          * doing the wakeup() call.
200          */
201         refcount_t              refs;
202 };
203
204 struct scrub_fixup_nodatasum {
205         struct scrub_ctx        *sctx;
206         struct btrfs_device     *dev;
207         u64                     logical;
208         struct btrfs_root       *root;
209         struct btrfs_work       work;
210         int                     mirror_num;
211 };
212
213 struct scrub_nocow_inode {
214         u64                     inum;
215         u64                     offset;
216         u64                     root;
217         struct list_head        list;
218 };
219
220 struct scrub_copy_nocow_ctx {
221         struct scrub_ctx        *sctx;
222         u64                     logical;
223         u64                     len;
224         int                     mirror_num;
225         u64                     physical_for_dev_replace;
226         struct list_head        inodes;
227         struct btrfs_work       work;
228 };
229
230 struct scrub_warning {
231         struct btrfs_path       *path;
232         u64                     extent_item_size;
233         const char              *errstr;
234         u64                     physical;
235         u64                     logical;
236         struct btrfs_device     *dev;
237 };
238
239 struct full_stripe_lock {
240         struct rb_node node;
241         u64 logical;
242         u64 refs;
243         struct mutex mutex;
244 };
245
246 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
247 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
248 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
249 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
250 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
251 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
252                                      struct scrub_block *sblocks_for_recheck);
253 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
254                                 struct scrub_block *sblock,
255                                 int retry_failed_mirror);
256 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
257 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
258                                              struct scrub_block *sblock_good);
259 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
260                                             struct scrub_block *sblock_good,
261                                             int page_num, int force_write);
262 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
263 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
264                                            int page_num);
265 static int scrub_checksum_data(struct scrub_block *sblock);
266 static int scrub_checksum_tree_block(struct scrub_block *sblock);
267 static int scrub_checksum_super(struct scrub_block *sblock);
268 static void scrub_block_get(struct scrub_block *sblock);
269 static void scrub_block_put(struct scrub_block *sblock);
270 static void scrub_page_get(struct scrub_page *spage);
271 static void scrub_page_put(struct scrub_page *spage);
272 static void scrub_parity_get(struct scrub_parity *sparity);
273 static void scrub_parity_put(struct scrub_parity *sparity);
274 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
275                                     struct scrub_page *spage);
276 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
277                        u64 physical, struct btrfs_device *dev, u64 flags,
278                        u64 gen, int mirror_num, u8 *csum, int force,
279                        u64 physical_for_dev_replace);
280 static void scrub_bio_end_io(struct bio *bio);
281 static void scrub_bio_end_io_worker(struct btrfs_work *work);
282 static void scrub_block_complete(struct scrub_block *sblock);
283 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
284                                u64 extent_logical, u64 extent_len,
285                                u64 *extent_physical,
286                                struct btrfs_device **extent_dev,
287                                int *extent_mirror_num);
288 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
289                                     struct scrub_page *spage);
290 static void scrub_wr_submit(struct scrub_ctx *sctx);
291 static void scrub_wr_bio_end_io(struct bio *bio);
292 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
293 static int write_page_nocow(struct scrub_ctx *sctx,
294                             u64 physical_for_dev_replace, struct page *page);
295 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
296                                       struct scrub_copy_nocow_ctx *ctx);
297 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
298                             int mirror_num, u64 physical_for_dev_replace);
299 static void copy_nocow_pages_worker(struct btrfs_work *work);
300 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
301 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
302 static void scrub_put_ctx(struct scrub_ctx *sctx);
303
304 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
305 {
306         return page->recover &&
307                (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
308 }
309
310 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
311 {
312         refcount_inc(&sctx->refs);
313         atomic_inc(&sctx->bios_in_flight);
314 }
315
316 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
317 {
318         atomic_dec(&sctx->bios_in_flight);
319         wake_up(&sctx->list_wait);
320         scrub_put_ctx(sctx);
321 }
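
/*
 * Editorial usage sketch, not in the original file (the real submit and
 * completion paths are outside this excerpt): the pair above is meant to
 * bracket every scrub bio in flight, roughly:
 *
 *	scrub_pending_bio_inc(sctx);	-- before submitting sbio->bio
 *	...
 *	scrub_pending_bio_dec(sctx);	-- from the bio end_io worker
 *
 * The inc side also takes a scrub_ctx reference so that the context cannot
 * be freed before the completion side has finished its wake_up() (see the
 * comment on scrub_ctx::refs above).
 */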
322
323 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
324 {
325         while (atomic_read(&fs_info->scrub_pause_req)) {
326                 mutex_unlock(&fs_info->scrub_lock);
327                 wait_event(fs_info->scrub_pause_wait,
328                    atomic_read(&fs_info->scrub_pause_req) == 0);
329                 mutex_lock(&fs_info->scrub_lock);
330         }
331 }
332
333 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
334 {
335         atomic_inc(&fs_info->scrubs_paused);
336         wake_up(&fs_info->scrub_pause_wait);
337 }
338
339 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
340 {
341         mutex_lock(&fs_info->scrub_lock);
342         __scrub_blocked_if_needed(fs_info);
343         atomic_dec(&fs_info->scrubs_paused);
344         mutex_unlock(&fs_info->scrub_lock);
345
346         wake_up(&fs_info->scrub_pause_wait);
347 }
348
349 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
350 {
351         scrub_pause_on(fs_info);
352         scrub_pause_off(fs_info);
353 }
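
/*
 * Editorial usage sketch, not in the original file (the real call sites are
 * outside this excerpt): a long-running scrub loop is expected to call
 * scrub_blocked_if_needed() periodically so that a pending pause request in
 * fs_info->scrub_pause_req is honored, e.g.:
 *
 *	while (more work to do) {
 *		scrub_blocked_if_needed(fs_info);
 *		... scrub the next stripe ...
 *	}
 *
 * scrub_pause_on()/scrub_pause_off() can also bracket a longer wait so the
 * scrub counts as paused for the whole duration rather than only at the
 * check point.
 */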
354
355 /*
356  * Insert new full stripe lock into full stripe locks tree
357  *
358  * Return pointer to existing or newly inserted full_stripe_lock structure if
359  * everything works well.
360  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
361  *
362  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
363  * function
364  */
365 static struct full_stripe_lock *insert_full_stripe_lock(
366                 struct btrfs_full_stripe_locks_tree *locks_root,
367                 u64 fstripe_logical)
368 {
369         struct rb_node **p;
370         struct rb_node *parent = NULL;
371         struct full_stripe_lock *entry;
372         struct full_stripe_lock *ret;
373
374         WARN_ON(!mutex_is_locked(&locks_root->lock));
375
376         p = &locks_root->root.rb_node;
377         while (*p) {
378                 parent = *p;
379                 entry = rb_entry(parent, struct full_stripe_lock, node);
380                 if (fstripe_logical < entry->logical) {
381                         p = &(*p)->rb_left;
382                 } else if (fstripe_logical > entry->logical) {
383                         p = &(*p)->rb_right;
384                 } else {
385                         entry->refs++;
386                         return entry;
387                 }
388         }
389
390         /* Insert new lock */
391         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
392         if (!ret)
393                 return ERR_PTR(-ENOMEM);
394         ret->logical = fstripe_logical;
395         ret->refs = 1;
396         mutex_init(&ret->mutex);
397
398         rb_link_node(&ret->node, parent, p);
399         rb_insert_color(&ret->node, &locks_root->root);
400         return ret;
401 }
402
403 /*
404  * Search for a full stripe lock of a block group
405  *
406  * Return pointer to existing full stripe lock if found
407  * Return NULL if not found
408  */
409 static struct full_stripe_lock *search_full_stripe_lock(
410                 struct btrfs_full_stripe_locks_tree *locks_root,
411                 u64 fstripe_logical)
412 {
413         struct rb_node *node;
414         struct full_stripe_lock *entry;
415
416         WARN_ON(!mutex_is_locked(&locks_root->lock));
417
418         node = locks_root->root.rb_node;
419         while (node) {
420                 entry = rb_entry(node, struct full_stripe_lock, node);
421                 if (fstripe_logical < entry->logical)
422                         node = node->rb_left;
423                 else if (fstripe_logical > entry->logical)
424                         node = node->rb_right;
425                 else
426                         return entry;
427         }
428         return NULL;
429 }
430
431 /*
432  * Helper to get full stripe logical from a normal bytenr.
433  *
434  * Caller must ensure @cache is a RAID56 block group.
435  */
436 static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
437                                    u64 bytenr)
438 {
439         u64 ret;
440
441         /*
442          * Due to chunk item size limit, full stripe length should not be
443          * larger than U32_MAX. Just a sanity check here.
444          */
445         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
446
447         /*
448          * round_down() can only handle a power of 2, while a RAID56 full
449          * stripe length can be 64KiB * n, so we need to round down manually.
450          */
451         ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
452                 cache->full_stripe_len + cache->key.objectid;
453         return ret;
454 }
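
/*
 * Editorial example with assumed numbers, not in the original file: for a
 * RAID5 chunk with three data stripes, full_stripe_len would be
 * 3 * 64KiB = 192KiB, which is not a power of two, hence the manual rounding
 * above. With cache->key.objectid == 1GiB and bytenr == 1GiB + 200KiB:
 *
 *	div64_u64(200KiB, 192KiB) * 192KiB + 1GiB == 1GiB + 192KiB
 *
 * so the full stripe containing @bytenr starts 8KiB below it.
 */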
455
456 /*
457  * Lock a full stripe to avoid concurrent recovery and read
458  *
459  * It's only used for profiles with parity (RAID5/6); for other profiles it
460  * does nothing.
461  *
462  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
463  * The caller must then call unlock_full_stripe() from the same context.
464  *
465  * Return <0 on error.
466  */
467 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
468                             bool *locked_ret)
469 {
470         struct btrfs_block_group_cache *bg_cache;
471         struct btrfs_full_stripe_locks_tree *locks_root;
472         struct full_stripe_lock *existing;
473         u64 fstripe_start;
474         int ret = 0;
475
476         *locked_ret = false;
477         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
478         if (!bg_cache) {
479                 ASSERT(0);
480                 return -ENOENT;
481         }
482
483         /* Profiles not based on parity don't need full stripe lock */
484         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
485                 goto out;
486         locks_root = &bg_cache->full_stripe_locks_root;
487
488         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
489
490         /* Now insert the full stripe lock */
491         mutex_lock(&locks_root->lock);
492         existing = insert_full_stripe_lock(locks_root, fstripe_start);
493         mutex_unlock(&locks_root->lock);
494         if (IS_ERR(existing)) {
495                 ret = PTR_ERR(existing);
496                 goto out;
497         }
498         mutex_lock(&existing->mutex);
499         *locked_ret = true;
500 out:
501         btrfs_put_block_group(bg_cache);
502         return ret;
503 }
504
505 /*
506  * Unlock a full stripe.
507  *
508  * NOTE: The caller must ensure this is called from the same context as the
509  * corresponding lock_full_stripe().
510  *
511  * Return 0 if we unlocked the full stripe without problem.
512  * Return <0 on error.
513  */
514 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
515                               bool locked)
516 {
517         struct btrfs_block_group_cache *bg_cache;
518         struct btrfs_full_stripe_locks_tree *locks_root;
519         struct full_stripe_lock *fstripe_lock;
520         u64 fstripe_start;
521         bool freeit = false;
522         int ret = 0;
523
524         /* If we didn't acquire full stripe lock, no need to continue */
525         if (!locked)
526                 return 0;
527
528         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
529         if (!bg_cache) {
530                 ASSERT(0);
531                 return -ENOENT;
532         }
533         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
534                 goto out;
535
536         locks_root = &bg_cache->full_stripe_locks_root;
537         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
538
539         mutex_lock(&locks_root->lock);
540         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
541         /* Unpaired unlock_full_stripe() detected */
542         if (!fstripe_lock) {
543                 WARN_ON(1);
544                 ret = -ENOENT;
545                 mutex_unlock(&locks_root->lock);
546                 goto out;
547         }
548
549         if (fstripe_lock->refs == 0) {
550                 WARN_ON(1);
551                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
552                         fstripe_lock->logical);
553         } else {
554                 fstripe_lock->refs--;
555         }
556
557         if (fstripe_lock->refs == 0) {
558                 rb_erase(&fstripe_lock->node, &locks_root->root);
559                 freeit = true;
560         }
561         mutex_unlock(&locks_root->lock);
562
563         mutex_unlock(&fstripe_lock->mutex);
564         if (freeit)
565                 kfree(fstripe_lock);
566 out:
567         btrfs_put_block_group(bg_cache);
568         return ret;
569 }
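
/*
 * Editorial usage sketch, not in the original file: how the pair above is
 * meant to be used by a repair path (scrub_handle_errored_block() below is
 * the real caller). The @locked flag is propagated so the unlock becomes a
 * no-op for non-RAID56 block groups.
 */
static inline int scrub_full_stripe_lock_sketch(struct btrfs_fs_info *fs_info,
						u64 logical)
{
	bool locked = false;
	int ret;

	ret = lock_full_stripe(fs_info, logical, &locked);
	if (ret < 0)
		return ret;

	/* ... recheck and repair the block at @logical ... */

	return unlock_full_stripe(fs_info, logical, locked);
}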
570
571 /*
572  * used for workers that require transaction commits (i.e., for the
573  * NOCOW case)
574  */
575 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
576 {
577         struct btrfs_fs_info *fs_info = sctx->fs_info;
578
579         refcount_inc(&sctx->refs);
580         /*
581          * increment scrubs_running to prevent cancel requests from
582          * completing as long as a worker is running. we must also
583          * increment scrubs_paused to prevent deadlocking on pause
584          * requests used for transaction commits (as the worker uses a
585          * transaction context). it is safe to regard the worker as
586          * paused for all practical matters; effectively, we only
587          * prevent cancellation requests from completing.
588          */
589         mutex_lock(&fs_info->scrub_lock);
590         atomic_inc(&fs_info->scrubs_running);
591         atomic_inc(&fs_info->scrubs_paused);
592         mutex_unlock(&fs_info->scrub_lock);
593
594         /*
595          * checking the @scrubs_running == @scrubs_paused condition
596          * inside wait_event() is not an atomic operation, which means
597          * we may inc/dec @scrubs_running/@scrubs_paused at any time.
598          * Wake up @scrub_pause_wait as often as we can so that a
599          * blocked transaction commit has to wait as little as possible.
600          */
601         wake_up(&fs_info->scrub_pause_wait);
602
603         atomic_inc(&sctx->workers_pending);
604 }
605
606 /* used for workers that require transaction commits */
607 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
608 {
609         struct btrfs_fs_info *fs_info = sctx->fs_info;
610
611         /*
612          * see scrub_pending_trans_workers_inc() for why we're pretending
613          * to be paused in the scrub counters
614          */
615         mutex_lock(&fs_info->scrub_lock);
616         atomic_dec(&fs_info->scrubs_running);
617         atomic_dec(&fs_info->scrubs_paused);
618         mutex_unlock(&fs_info->scrub_lock);
619         atomic_dec(&sctx->workers_pending);
620         wake_up(&fs_info->scrub_pause_wait);
621         wake_up(&sctx->list_wait);
622         scrub_put_ctx(sctx);
623 }
624
625 static void scrub_free_csums(struct scrub_ctx *sctx)
626 {
627         while (!list_empty(&sctx->csum_list)) {
628                 struct btrfs_ordered_sum *sum;
629                 sum = list_first_entry(&sctx->csum_list,
630                                        struct btrfs_ordered_sum, list);
631                 list_del(&sum->list);
632                 kfree(sum);
633         }
634 }
635
636 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
637 {
638         int i;
639
640         if (!sctx)
641                 return;
642
643         /* this can happen when scrub is cancelled */
644         if (sctx->curr != -1) {
645                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
646
647                 for (i = 0; i < sbio->page_count; i++) {
648                         WARN_ON(!sbio->pagev[i]->page);
649                         scrub_block_put(sbio->pagev[i]->sblock);
650                 }
651                 bio_put(sbio->bio);
652         }
653
654         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
655                 struct scrub_bio *sbio = sctx->bios[i];
656
657                 if (!sbio)
658                         break;
659                 kfree(sbio);
660         }
661
662         kfree(sctx->wr_curr_bio);
663         scrub_free_csums(sctx);
664         kfree(sctx);
665 }
666
667 static void scrub_put_ctx(struct scrub_ctx *sctx)
668 {
669         if (refcount_dec_and_test(&sctx->refs))
670                 scrub_free_ctx(sctx);
671 }
672
673 static noinline_for_stack
674 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
675 {
676         struct scrub_ctx *sctx;
677         int             i;
678         struct btrfs_fs_info *fs_info = dev->fs_info;
679
680         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
681         if (!sctx)
682                 goto nomem;
683         refcount_set(&sctx->refs, 1);
684         sctx->is_dev_replace = is_dev_replace;
685         sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
686         sctx->curr = -1;
687         sctx->fs_info = dev->fs_info;
688         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
689                 struct scrub_bio *sbio;
690
691                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
692                 if (!sbio)
693                         goto nomem;
694                 sctx->bios[i] = sbio;
695
696                 sbio->index = i;
697                 sbio->sctx = sctx;
698                 sbio->page_count = 0;
699                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
700                                 scrub_bio_end_io_worker, NULL, NULL);
701
702                 if (i != SCRUB_BIOS_PER_SCTX - 1)
703                         sctx->bios[i]->next_free = i + 1;
704                 else
705                         sctx->bios[i]->next_free = -1;
706         }
707         sctx->first_free = 0;
708         atomic_set(&sctx->bios_in_flight, 0);
709         atomic_set(&sctx->workers_pending, 0);
710         atomic_set(&sctx->cancel_req, 0);
711         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
712         INIT_LIST_HEAD(&sctx->csum_list);
713
714         spin_lock_init(&sctx->list_lock);
715         spin_lock_init(&sctx->stat_lock);
716         init_waitqueue_head(&sctx->list_wait);
717
718         WARN_ON(sctx->wr_curr_bio != NULL);
719         mutex_init(&sctx->wr_lock);
720         sctx->wr_curr_bio = NULL;
721         if (is_dev_replace) {
722                 WARN_ON(!fs_info->dev_replace.tgtdev);
723                 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
724                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
725                 sctx->flush_all_writes = false;
726         }
727
728         return sctx;
729
730 nomem:
731         scrub_free_ctx(sctx);
732         return ERR_PTR(-ENOMEM);
733 }
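
/*
 * Editorial note, not in the original file: scrub_setup_ctx() above threads
 * sctx->bios[] into a simple free list over array indices: sctx->first_free
 * is the head, each bios[i]->next_free points at the next free slot, and -1
 * terminates the list, e.g. with SCRUB_BIOS_PER_SCTX == 64:
 *
 *	first_free = 0, bios[0]->next_free = 1, ..., bios[63]->next_free = -1
 *
 * Callers are expected to pop a scrub_bio from first_free (presumably under
 * list_lock) and push it back on completion; those sites are outside this
 * excerpt.
 */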
734
735 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
736                                      void *warn_ctx)
737 {
738         u64 isize;
739         u32 nlink;
740         int ret;
741         int i;
742         unsigned nofs_flag;
743         struct extent_buffer *eb;
744         struct btrfs_inode_item *inode_item;
745         struct scrub_warning *swarn = warn_ctx;
746         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
747         struct inode_fs_paths *ipath = NULL;
748         struct btrfs_root *local_root;
749         struct btrfs_key root_key;
750         struct btrfs_key key;
751
752         root_key.objectid = root;
753         root_key.type = BTRFS_ROOT_ITEM_KEY;
754         root_key.offset = (u64)-1;
755         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
756         if (IS_ERR(local_root)) {
757                 ret = PTR_ERR(local_root);
758                 goto err;
759         }
760
761         /*
762          * this makes the path point to (inum INODE_ITEM ioff)
763          */
764         key.objectid = inum;
765         key.type = BTRFS_INODE_ITEM_KEY;
766         key.offset = 0;
767
768         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
769         if (ret) {
770                 btrfs_release_path(swarn->path);
771                 goto err;
772         }
773
774         eb = swarn->path->nodes[0];
775         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
776                                         struct btrfs_inode_item);
777         isize = btrfs_inode_size(eb, inode_item);
778         nlink = btrfs_inode_nlink(eb, inode_item);
779         btrfs_release_path(swarn->path);
780
781         /*
782          * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
783          * uses GFP_NOFS in this context, so we keep it consistent but it does
784          * not seem to be strictly necessary.
785          */
786         nofs_flag = memalloc_nofs_save();
787         ipath = init_ipath(4096, local_root, swarn->path);
788         memalloc_nofs_restore(nofs_flag);
789         if (IS_ERR(ipath)) {
790                 ret = PTR_ERR(ipath);
791                 ipath = NULL;
792                 goto err;
793         }
794         ret = paths_from_inode(inum, ipath);
795
796         if (ret < 0)
797                 goto err;
798
799         /*
800          * we deliberately ignore the fact that ipath might have been too
801          * small to hold all of the paths here
802          */
803         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
804                 btrfs_warn_in_rcu(fs_info,
805 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
806                                   swarn->errstr, swarn->logical,
807                                   rcu_str_deref(swarn->dev->name),
808                                   swarn->physical,
809                                   root, inum, offset,
810                                   min(isize - offset, (u64)PAGE_SIZE), nlink,
811                                   (char *)(unsigned long)ipath->fspath->val[i]);
812
813         free_ipath(ipath);
814         return 0;
815
816 err:
817         btrfs_warn_in_rcu(fs_info,
818                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
819                           swarn->errstr, swarn->logical,
820                           rcu_str_deref(swarn->dev->name),
821                           swarn->physical,
822                           root, inum, offset, ret);
823
824         free_ipath(ipath);
825         return 0;
826 }
827
828 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
829 {
830         struct btrfs_device *dev;
831         struct btrfs_fs_info *fs_info;
832         struct btrfs_path *path;
833         struct btrfs_key found_key;
834         struct extent_buffer *eb;
835         struct btrfs_extent_item *ei;
836         struct scrub_warning swarn;
837         unsigned long ptr = 0;
838         u64 extent_item_pos;
839         u64 flags = 0;
840         u64 ref_root;
841         u32 item_size;
842         u8 ref_level = 0;
843         int ret;
844
845         WARN_ON(sblock->page_count < 1);
846         dev = sblock->pagev[0]->dev;
847         fs_info = sblock->sctx->fs_info;
848
849         path = btrfs_alloc_path();
850         if (!path)
851                 return;
852
853         swarn.physical = sblock->pagev[0]->physical;
854         swarn.logical = sblock->pagev[0]->logical;
855         swarn.errstr = errstr;
856         swarn.dev = NULL;
857
858         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
859                                   &flags);
860         if (ret < 0)
861                 goto out;
862
863         extent_item_pos = swarn.logical - found_key.objectid;
864         swarn.extent_item_size = found_key.offset;
865
866         eb = path->nodes[0];
867         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
868         item_size = btrfs_item_size_nr(eb, path->slots[0]);
869
870         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
871                 do {
872                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
873                                                       item_size, &ref_root,
874                                                       &ref_level);
875                         btrfs_warn_in_rcu(fs_info,
876 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
877                                 errstr, swarn.logical,
878                                 rcu_str_deref(dev->name),
879                                 swarn.physical,
880                                 ref_level ? "node" : "leaf",
881                                 ret < 0 ? -1 : ref_level,
882                                 ret < 0 ? -1 : ref_root);
883                 } while (ret != 1);
884                 btrfs_release_path(path);
885         } else {
886                 btrfs_release_path(path);
887                 swarn.path = path;
888                 swarn.dev = dev;
889                 iterate_extent_inodes(fs_info, found_key.objectid,
890                                         extent_item_pos, 1,
891                                         scrub_print_warning_inode, &swarn, false);
892         }
893
894 out:
895         btrfs_free_path(path);
896 }
897
898 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
899 {
900         struct page *page = NULL;
901         unsigned long index;
902         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
903         int ret;
904         int corrected = 0;
905         struct btrfs_key key;
906         struct inode *inode = NULL;
907         struct btrfs_fs_info *fs_info;
908         u64 end = offset + PAGE_SIZE - 1;
909         struct btrfs_root *local_root;
910         int srcu_index;
911
912         key.objectid = root;
913         key.type = BTRFS_ROOT_ITEM_KEY;
914         key.offset = (u64)-1;
915
916         fs_info = fixup->root->fs_info;
917         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
918
919         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
920         if (IS_ERR(local_root)) {
921                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
922                 return PTR_ERR(local_root);
923         }
924
925         key.type = BTRFS_INODE_ITEM_KEY;
926         key.objectid = inum;
927         key.offset = 0;
928         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
929         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
930         if (IS_ERR(inode))
931                 return PTR_ERR(inode);
932
933         index = offset >> PAGE_SHIFT;
934
935         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
936         if (!page) {
937                 ret = -ENOMEM;
938                 goto out;
939         }
940
941         if (PageUptodate(page)) {
942                 if (PageDirty(page)) {
943                         /*
944                          * we need to write the data to the defective sector. the
945                          * data that was in that sector is not in memory,
946                          * because the page was modified. we must not write the
947                          * modified page to that sector.
948                          *
949                          * TODO: what could be done here: wait for the delalloc
950                          *       runner to write out that page (might involve
951                          *       COW) and see whether the sector is still
952                          *       referenced afterwards.
953                          *
954                          * For the time being, we'll treat this error as
955                          * uncorrectable, although there is a chance that a
956                          * later scrub will find the bad sector again and that
957                          * there's no dirty page in memory, then.
958                          */
959                         ret = -EIO;
960                         goto out;
961                 }
962                 ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
963                                         fixup->logical, page,
964                                         offset - page_offset(page),
965                                         fixup->mirror_num);
966                 unlock_page(page);
967                 corrected = !ret;
968         } else {
969                 /*
970                  * we need to get good data first. the general readpage path
971                  * will call repair_io_failure for us, we just have to make
972                  * sure we read the bad mirror.
973                  */
974                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
975                                         EXTENT_DAMAGED);
976                 if (ret) {
977                         /* set_extent_bits should give proper error */
978                         WARN_ON(ret > 0);
979                         if (ret > 0)
980                                 ret = -EFAULT;
981                         goto out;
982                 }
983
984                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
985                                                 btrfs_get_extent,
986                                                 fixup->mirror_num);
987                 wait_on_page_locked(page);
988
989                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
990                                                 end, EXTENT_DAMAGED, 0, NULL);
991                 if (!corrected)
992                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
993                                                 EXTENT_DAMAGED);
994         }
995
996 out:
997         if (page)
998                 put_page(page);
999
1000         iput(inode);
1001
1002         if (ret < 0)
1003                 return ret;
1004
1005         if (ret == 0 && corrected) {
1006                 /*
1007                  * we only need to call readpage for one of the inodes belonging
1008                  * to this extent. so make iterate_extent_inodes stop
1009                  */
1010                 return 1;
1011         }
1012
1013         return -EIO;
1014 }
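
/*
 * Editorial note, not in the original file: summary of the callback contract
 * above as used by scrub_fixup_nodatasum() below. scrub_fixup_readpage()
 * returns 1 once the page was corrected (so the inode iteration stops),
 * a negative errno on setup failure, and -EIO when the page stayed damaged.
 * The caller treats any negative return as uncorrectable and warns when the
 * iteration does not report exactly one corrected page.
 */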
1015
1016 static void scrub_fixup_nodatasum(struct btrfs_work *work)
1017 {
1018         struct btrfs_fs_info *fs_info;
1019         int ret;
1020         struct scrub_fixup_nodatasum *fixup;
1021         struct scrub_ctx *sctx;
1022         struct btrfs_trans_handle *trans = NULL;
1023         struct btrfs_path *path;
1024         int uncorrectable = 0;
1025
1026         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
1027         sctx = fixup->sctx;
1028         fs_info = fixup->root->fs_info;
1029
1030         path = btrfs_alloc_path();
1031         if (!path) {
1032                 spin_lock(&sctx->stat_lock);
1033                 ++sctx->stat.malloc_errors;
1034                 spin_unlock(&sctx->stat_lock);
1035                 uncorrectable = 1;
1036                 goto out;
1037         }
1038
1039         trans = btrfs_join_transaction(fixup->root);
1040         if (IS_ERR(trans)) {
1041                 uncorrectable = 1;
1042                 goto out;
1043         }
1044
1045         /*
1046          * the idea is to trigger a regular read through the standard path. we
1047          * read a page from the (failed) logical address by specifying the
1048          * corresponding copy number (i.e. mirror) of the failed sector. thus,
1049          * that readpage is expected to fail.
1050          * that is the point where on-the-fly error correction will kick in
1051          * (once it's finished) and rewrite the failed sector if a good copy
1052          * can be found.
1053          */
1054         ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
1055                                           scrub_fixup_readpage, fixup, false);
1056         if (ret < 0) {
1057                 uncorrectable = 1;
1058                 goto out;
1059         }
1060         WARN_ON(ret != 1);
1061
1062         spin_lock(&sctx->stat_lock);
1063         ++sctx->stat.corrected_errors;
1064         spin_unlock(&sctx->stat_lock);
1065
1066 out:
1067         if (trans && !IS_ERR(trans))
1068                 btrfs_end_transaction(trans);
1069         if (uncorrectable) {
1070                 spin_lock(&sctx->stat_lock);
1071                 ++sctx->stat.uncorrectable_errors;
1072                 spin_unlock(&sctx->stat_lock);
1073                 btrfs_dev_replace_stats_inc(
1074                         &fs_info->dev_replace.num_uncorrectable_read_errors);
1075                 btrfs_err_rl_in_rcu(fs_info,
1076                     "unable to fixup (nodatasum) error at logical %llu on dev %s",
1077                         fixup->logical, rcu_str_deref(fixup->dev->name));
1078         }
1079
1080         btrfs_free_path(path);
1081         kfree(fixup);
1082
1083         scrub_pending_trans_workers_dec(sctx);
1084 }
1085
1086 static inline void scrub_get_recover(struct scrub_recover *recover)
1087 {
1088         refcount_inc(&recover->refs);
1089 }
1090
1091 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
1092                                      struct scrub_recover *recover)
1093 {
1094         if (refcount_dec_and_test(&recover->refs)) {
1095                 btrfs_bio_counter_dec(fs_info);
1096                 btrfs_put_bbio(recover->bbio);
1097                 kfree(recover);
1098         }
1099 }
1100
1101 /*
1102  * scrub_handle_errored_block gets called when either verification of the
1103  * pages failed or the bio failed to read, e.g. with EIO. In the latter
1104  * case, this function handles all pages in the bio, even though only one
1105  * may be bad.
1106  * The goal of this function is to repair the errored block by using the
1107  * contents of one of the mirrors.
1108  */
1109 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
1110 {
1111         struct scrub_ctx *sctx = sblock_to_check->sctx;
1112         struct btrfs_device *dev;
1113         struct btrfs_fs_info *fs_info;
1114         u64 length;
1115         u64 logical;
1116         unsigned int failed_mirror_index;
1117         unsigned int is_metadata;
1118         unsigned int have_csum;
1119         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
1120         struct scrub_block *sblock_bad;
1121         int ret;
1122         int mirror_index;
1123         int page_num;
1124         int success;
1125         bool full_stripe_locked;
1126         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1127                                       DEFAULT_RATELIMIT_BURST);
1128
1129         BUG_ON(sblock_to_check->page_count < 1);
1130         fs_info = sctx->fs_info;
1131         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1132                 /*
1133                  * if we find an error in a super block, we just report it.
1134                  * Super blocks get rewritten with the next transaction commit
1135                  * anyway
1136                  */
1137                 spin_lock(&sctx->stat_lock);
1138                 ++sctx->stat.super_errors;
1139                 spin_unlock(&sctx->stat_lock);
1140                 return 0;
1141         }
1142         length = sblock_to_check->page_count * PAGE_SIZE;
1143         logical = sblock_to_check->pagev[0]->logical;
1144         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
1145         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
1146         is_metadata = !(sblock_to_check->pagev[0]->flags &
1147                         BTRFS_EXTENT_FLAG_DATA);
1148         have_csum = sblock_to_check->pagev[0]->have_csum;
1149         dev = sblock_to_check->pagev[0]->dev;
1150
1151         /*
1152          * For RAID5/6, a race can happen between the scrub threads of
1153          * different devices. On data corruption, the parity and the data
1154          * scrub threads will both try to recover the data.
1155          * The race can lead to a doubly counted csum error, or even an
1156          * unrecoverable error.
1157          */
1158         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
1159         if (ret < 0) {
1160                 spin_lock(&sctx->stat_lock);
1161                 if (ret == -ENOMEM)
1162                         sctx->stat.malloc_errors++;
1163                 sctx->stat.read_errors++;
1164                 sctx->stat.uncorrectable_errors++;
1165                 spin_unlock(&sctx->stat_lock);
1166                 return ret;
1167         }
1168
1169         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
1170                 sblocks_for_recheck = NULL;
1171                 goto nodatasum_case;
1172         }
1173
1174         /*
1175          * read all mirrors one after the other. This includes re-reading
1176          * the extent or metadata block that failed (that was
1177          * the cause that this fixup code is called) another time,
1178          * page by page this time in order to know which pages
1179          * caused I/O errors and which ones are good (for all mirrors).
1180          * It is the goal to handle the situation when more than one
1181          * mirror contains I/O errors, but the errors do not
1182          * overlap, i.e. the data can be repaired by selecting the
1183          * pages from those mirrors without I/O error on the
1184          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
1185          * would be that mirror #1 has an I/O error on the first page,
1186          * the second page is good, and mirror #2 has an I/O error on
1187          * the second page, but the first page is good.
1188          * Then the first page of the first mirror can be repaired by
1189          * taking the first page of the second mirror, and the
1190          * second page of the second mirror can be repaired by
1191          * copying the contents of the 2nd page of the 1st mirror.
1192          * One more note: if the pages of one mirror contain I/O
1193          * errors, the checksum cannot be verified. In order to get
1194          * the best data for repairing, the first attempt is to find
1195          * a mirror without I/O errors and with a validated checksum.
1196          * Only if this is not possible, the pages are picked from
1197          * mirrors with I/O errors without considering the checksum.
1198          * If the latter is the case, at the end, the checksum of the
1199          * repaired area is verified in order to correctly maintain
1200          * the statistics.
1201          */
1202
1203         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
1204                                       sizeof(*sblocks_for_recheck), GFP_NOFS);
1205         if (!sblocks_for_recheck) {
1206                 spin_lock(&sctx->stat_lock);
1207                 sctx->stat.malloc_errors++;
1208                 sctx->stat.read_errors++;
1209                 sctx->stat.uncorrectable_errors++;
1210                 spin_unlock(&sctx->stat_lock);
1211                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1212                 goto out;
1213         }
1214
1215         /* setup the context, map the logical blocks and alloc the pages */
1216         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
1217         if (ret) {
1218                 spin_lock(&sctx->stat_lock);
1219                 sctx->stat.read_errors++;
1220                 sctx->stat.uncorrectable_errors++;
1221                 spin_unlock(&sctx->stat_lock);
1222                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1223                 goto out;
1224         }
1225         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
1226         sblock_bad = sblocks_for_recheck + failed_mirror_index;
1227
1228         /* build and submit the bios for the failed mirror, check checksums */
1229         scrub_recheck_block(fs_info, sblock_bad, 1);
1230
1231         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
1232             sblock_bad->no_io_error_seen) {
1233                 /*
1234                  * the error disappeared after reading page by page, or
1235                  * the area was part of a huge bio and other parts of the
1236                  * bio caused I/O errors, or the block layer merged several
1237                  * read requests into one and the error is caused by a
1238                  * different bio (usually one of the two latter cases is
1239                  * the cause)
1240                  */
1241                 spin_lock(&sctx->stat_lock);
1242                 sctx->stat.unverified_errors++;
1243                 sblock_to_check->data_corrected = 1;
1244                 spin_unlock(&sctx->stat_lock);
1245
1246                 if (sctx->is_dev_replace)
1247                         scrub_write_block_to_dev_replace(sblock_bad);
1248                 goto out;
1249         }
1250
1251         if (!sblock_bad->no_io_error_seen) {
1252                 spin_lock(&sctx->stat_lock);
1253                 sctx->stat.read_errors++;
1254                 spin_unlock(&sctx->stat_lock);
1255                 if (__ratelimit(&_rs))
1256                         scrub_print_warning("i/o error", sblock_to_check);
1257                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1258         } else if (sblock_bad->checksum_error) {
1259                 spin_lock(&sctx->stat_lock);
1260                 sctx->stat.csum_errors++;
1261                 spin_unlock(&sctx->stat_lock);
1262                 if (__ratelimit(&_rs))
1263                         scrub_print_warning("checksum error", sblock_to_check);
1264                 btrfs_dev_stat_inc_and_print(dev,
1265                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1266         } else if (sblock_bad->header_error) {
1267                 spin_lock(&sctx->stat_lock);
1268                 sctx->stat.verify_errors++;
1269                 spin_unlock(&sctx->stat_lock);
1270                 if (__ratelimit(&_rs))
1271                         scrub_print_warning("checksum/header error",
1272                                             sblock_to_check);
1273                 if (sblock_bad->generation_error)
1274                         btrfs_dev_stat_inc_and_print(dev,
1275                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1276                 else
1277                         btrfs_dev_stat_inc_and_print(dev,
1278                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1279         }
1280
1281         if (sctx->readonly) {
1282                 ASSERT(!sctx->is_dev_replace);
1283                 goto out;
1284         }
1285
1286         if (!is_metadata && !have_csum) {
1287                 struct scrub_fixup_nodatasum *fixup_nodatasum;
1288
1289                 WARN_ON(sctx->is_dev_replace);
1290
1291 nodatasum_case:
1292
1293                 /*
1294                  * !is_metadata and !have_csum: this means that the data
1295                  * might not be COWed and might be modified
1296                  * concurrently. The general strategy of working on the
1297                  * commit root does not help in the case when COW is not
1298                  * used.
1299                  */
1300                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1301                 if (!fixup_nodatasum)
1302                         goto did_not_correct_error;
1303                 fixup_nodatasum->sctx = sctx;
1304                 fixup_nodatasum->dev = dev;
1305                 fixup_nodatasum->logical = logical;
1306                 fixup_nodatasum->root = fs_info->extent_root;
1307                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1308                 scrub_pending_trans_workers_inc(sctx);
1309                 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1310                                 scrub_fixup_nodatasum, NULL, NULL);
1311                 btrfs_queue_work(fs_info->scrub_workers,
1312                                  &fixup_nodatasum->work);
1313                 goto out;
1314         }
1315
1316         /*
1317          * now build and submit the bios for the other mirrors, check
1318          * checksums.
1319          * First try to pick the mirror which is completely without I/O
1320          * errors and also does not have a checksum error.
1321          * If one is found, and if a checksum is present, the full block
1322          * that is known to contain an error is rewritten. Afterwards
1323          * the block is known to be corrected.
1324          * If a mirror is found which is completely correct, and no
1325          * checksum is present, only those pages are rewritten that had
1326          * an I/O error in the block to be repaired, since it cannot be
1327          * determined, which copy of the other pages is better (and it
1328          * could happen otherwise that a correct page would be
1329          * overwritten by a bad one).
1330          */
1331         for (mirror_index = 0; ;mirror_index++) {
1332                 struct scrub_block *sblock_other;
1333
1334                 if (mirror_index == failed_mirror_index)
1335                         continue;
1336
1337                 /* raid56's mirror count can exceed BTRFS_MAX_MIRRORS */
1338                 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1339                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1340                                 break;
1341                         if (!sblocks_for_recheck[mirror_index].page_count)
1342                                 break;
1343
1344                         sblock_other = sblocks_for_recheck + mirror_index;
1345                 } else {
1346                         struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1347                         int max_allowed = r->bbio->num_stripes -
1348                                                 r->bbio->num_tgtdevs;
1349
1350                         if (mirror_index >= max_allowed)
1351                                 break;
1352                         if (!sblocks_for_recheck[1].page_count)
1353                                 break;
1354
1355                         ASSERT(failed_mirror_index == 0);
1356                         sblock_other = sblocks_for_recheck + 1;
1357                         sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1358                 }
1359
1360                 /* build and submit the bios, check checksums */
1361                 scrub_recheck_block(fs_info, sblock_other, 0);
1362
1363                 if (!sblock_other->header_error &&
1364                     !sblock_other->checksum_error &&
1365                     sblock_other->no_io_error_seen) {
1366                         if (sctx->is_dev_replace) {
1367                                 scrub_write_block_to_dev_replace(sblock_other);
1368                                 goto corrected_error;
1369                         } else {
1370                                 ret = scrub_repair_block_from_good_copy(
1371                                                 sblock_bad, sblock_other);
1372                                 if (!ret)
1373                                         goto corrected_error;
1374                         }
1375                 }
1376         }
1377
1378         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1379                 goto did_not_correct_error;
1380
1381         /*
1382          * In case of I/O errors in the area that is supposed to be
1383          * repaired, continue by picking good copies of those pages.
1384          * Select the good pages from mirrors to rewrite the bad pages
1385          * in the area to fix. Afterwards verify the checksum of the
1386          * block that is supposed to be repaired. This verification
1387          * step is only done for the purpose of statistics counting and
1388          * for the final scrub report that states whether errors remain.
1389          * A perfect algorithm could make use of the checksum and try
1390          * all possible combinations of pages from the different mirrors
1391          * until the checksum verification succeeds. For example, when
1392          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1393          * of mirror #2 is readable but the final checksum test fails,
1394          * then the 2nd page of mirror #3 could be tried to see whether
1395          * the final checksum then succeeds. But this would be a rare
1396          * exception and is therefore not implemented. At least it is
1397          * guaranteed that a good copy is never overwritten.
1398          * A more useful improvement would be to pick the sectors
1399          * without I/O errors based on the sector size (512 bytes on
1400          * legacy disks) instead of on PAGE_SIZE. Then 512 bytes of one
1401          * mirror could be repaired with 512 bytes taken from a different
1402          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1403          * area are unreadable.
1404          */
1405         success = 1;
1406         for (page_num = 0; page_num < sblock_bad->page_count;
1407              page_num++) {
1408                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1409                 struct scrub_block *sblock_other = NULL;
1410
1411                 /* skip no-io-error page in scrub */
1412                 if (!page_bad->io_error && !sctx->is_dev_replace)
1413                         continue;
1414
1415                 /* try to find no-io-error page in mirrors */
1416                 if (page_bad->io_error) {
1417                         for (mirror_index = 0;
1418                              mirror_index < BTRFS_MAX_MIRRORS &&
1419                              sblocks_for_recheck[mirror_index].page_count > 0;
1420                              mirror_index++) {
1421                                 if (!sblocks_for_recheck[mirror_index].
1422                                     pagev[page_num]->io_error) {
1423                                         sblock_other = sblocks_for_recheck +
1424                                                        mirror_index;
1425                                         break;
1426                                 }
1427                         }
1428                         if (!sblock_other)
1429                                 success = 0;
1430                 }
1431
1432                 if (sctx->is_dev_replace) {
1433                         /*
1434                          * Did not find a mirror to fetch the page
1435                          * from. scrub_write_page_to_dev_replace()
1436                          * handles this case (page->io_error) by
1437                          * filling the block with zeros before
1438                          * submitting the write request.
1439                          */
1440                         if (!sblock_other)
1441                                 sblock_other = sblock_bad;
1442
1443                         if (scrub_write_page_to_dev_replace(sblock_other,
1444                                                             page_num) != 0) {
1445                                 btrfs_dev_replace_stats_inc(
1446                                         &fs_info->dev_replace.num_write_errors);
1447                                 success = 0;
1448                         }
1449                 } else if (sblock_other) {
1450                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1451                                                                sblock_other,
1452                                                                page_num, 0);
1453                         if (0 == ret)
1454                                 page_bad->io_error = 0;
1455                         else
1456                                 success = 0;
1457                 }
1458         }
1459
1460         if (success && !sctx->is_dev_replace) {
1461                 if (is_metadata || have_csum) {
1462                         /*
1463                          * need to verify the checksum now that all
1464                          * sectors on disk are repaired (the write
1465                          * request for data to be repaired is on its way).
1466                          * Just be lazy and use scrub_recheck_block()
1467                          * which re-reads the data before the checksum
1468                          * is verified, but most likely the data comes out
1469                          * of the page cache.
1470                          */
1471                         scrub_recheck_block(fs_info, sblock_bad, 1);
1472                         if (!sblock_bad->header_error &&
1473                             !sblock_bad->checksum_error &&
1474                             sblock_bad->no_io_error_seen)
1475                                 goto corrected_error;
1476                         else
1477                                 goto did_not_correct_error;
1478                 } else {
1479 corrected_error:
1480                         spin_lock(&sctx->stat_lock);
1481                         sctx->stat.corrected_errors++;
1482                         sblock_to_check->data_corrected = 1;
1483                         spin_unlock(&sctx->stat_lock);
1484                         btrfs_err_rl_in_rcu(fs_info,
1485                                 "fixed up error at logical %llu on dev %s",
1486                                 logical, rcu_str_deref(dev->name));
1487                 }
1488         } else {
1489 did_not_correct_error:
1490                 spin_lock(&sctx->stat_lock);
1491                 sctx->stat.uncorrectable_errors++;
1492                 spin_unlock(&sctx->stat_lock);
1493                 btrfs_err_rl_in_rcu(fs_info,
1494                         "unable to fixup (regular) error at logical %llu on dev %s",
1495                         logical, rcu_str_deref(dev->name));
1496         }
1497
1498 out:
1499         if (sblocks_for_recheck) {
1500                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1501                      mirror_index++) {
1502                         struct scrub_block *sblock = sblocks_for_recheck +
1503                                                      mirror_index;
1504                         struct scrub_recover *recover;
1505                         int page_index;
1506
1507                         for (page_index = 0; page_index < sblock->page_count;
1508                              page_index++) {
1509                                 sblock->pagev[page_index]->sblock = NULL;
1510                                 recover = sblock->pagev[page_index]->recover;
1511                                 if (recover) {
1512                                         scrub_put_recover(fs_info, recover);
1513                                         sblock->pagev[page_index]->recover =
1514                                                                         NULL;
1515                                 }
1516                                 scrub_page_put(sblock->pagev[page_index]);
1517                         }
1518                 }
1519                 kfree(sblocks_for_recheck);
1520         }
1521
1522         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1523         if (ret < 0)
1524                 return ret;
1525         return 0;
1526 }
1527
1528 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1529 {
1530         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1531                 return 2;
1532         else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1533                 return 3;
1534         else
1535                 return (int)bbio->num_stripes;
1536 }
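
/*
 * Illustrative sketch (an unused helper added for documentation, with assumed
 * numbers): for RAID6 there are always three ways to obtain a stripe's data
 * -- the direct read plus reconstruction via the P and Q stripes -- no matter
 * how many devices the chunk spans, so scrub_nr_raid_mirrors() reports 3.
 */
static inline void scrub_nr_raid_mirrors_example(void)
{
	struct btrfs_bio bbio = {
		.map_type = BTRFS_BLOCK_GROUP_RAID6,
		.num_stripes = 6,	/* assumed 6-device chunk */
	};

	WARN_ON(scrub_nr_raid_mirrors(&bbio) != 3);
}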
1537
1538 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1539                                                  u64 *raid_map,
1540                                                  u64 mapped_length,
1541                                                  int nstripes, int mirror,
1542                                                  int *stripe_index,
1543                                                  u64 *stripe_offset)
1544 {
1545         int i;
1546
1547         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1548                 /* RAID5/6 */
1549                 for (i = 0; i < nstripes; i++) {
1550                         if (raid_map[i] == RAID6_Q_STRIPE ||
1551                             raid_map[i] == RAID5_P_STRIPE)
1552                                 continue;
1553
1554                         if (logical >= raid_map[i] &&
1555                             logical < raid_map[i] + mapped_length)
1556                                 break;
1557                 }
1558
1559                 *stripe_index = i;
1560                 *stripe_offset = logical - raid_map[i];
1561         } else {
1562                 /* The other RAID type */
1563                 *stripe_index = mirror;
1564                 *stripe_offset = 0;
1565         }
1566 }
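
/*
 * Illustrative sketch (an unused helper with hypothetical numbers): how the
 * RAID5/6 branch of scrub_stripe_index_and_offset() resolves a logical
 * address.  With two 64K data stripes starting at logical 0 and 64K plus a
 * P stripe, logical 70000 lands in the second data stripe at offset 4464.
 */
static inline void scrub_stripe_index_example(void)
{
	u64 raid_map[3] = { 0, 65536, RAID5_P_STRIPE };
	u64 stripe_offset;
	int stripe_index;

	scrub_stripe_index_and_offset(70000, BTRFS_BLOCK_GROUP_RAID5,
				      raid_map, 65536, 3, 0,
				      &stripe_index, &stripe_offset);
	WARN_ON(stripe_index != 1 || stripe_offset != 70000 - 65536);
}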
1567
1568 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1569                                      struct scrub_block *sblocks_for_recheck)
1570 {
1571         struct scrub_ctx *sctx = original_sblock->sctx;
1572         struct btrfs_fs_info *fs_info = sctx->fs_info;
1573         u64 length = original_sblock->page_count * PAGE_SIZE;
1574         u64 logical = original_sblock->pagev[0]->logical;
1575         u64 generation = original_sblock->pagev[0]->generation;
1576         u64 flags = original_sblock->pagev[0]->flags;
1577         u64 have_csum = original_sblock->pagev[0]->have_csum;
1578         struct scrub_recover *recover;
1579         struct btrfs_bio *bbio;
1580         u64 sublen;
1581         u64 mapped_length;
1582         u64 stripe_offset;
1583         int stripe_index;
1584         int page_index = 0;
1585         int mirror_index;
1586         int nmirrors;
1587         int ret;
1588
1589         /*
1590          * note: the two members refs and outstanding_pages
1591          * are not used (and not set) in the blocks that are used for
1592          * the recheck procedure
1593          */
1594
1595         while (length > 0) {
1596                 sublen = min_t(u64, length, PAGE_SIZE);
1597                 mapped_length = sublen;
1598                 bbio = NULL;
1599
1600                 /*
1601                  * with a length of PAGE_SIZE, each returned stripe
1602                  * represents one mirror
1603                  */
1604                 btrfs_bio_counter_inc_blocked(fs_info);
1605                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1606                                 logical, &mapped_length, &bbio);
1607                 if (ret || !bbio || mapped_length < sublen) {
1608                         btrfs_put_bbio(bbio);
1609                         btrfs_bio_counter_dec(fs_info);
1610                         return -EIO;
1611                 }
1612
1613                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1614                 if (!recover) {
1615                         btrfs_put_bbio(bbio);
1616                         btrfs_bio_counter_dec(fs_info);
1617                         return -ENOMEM;
1618                 }
1619
1620                 refcount_set(&recover->refs, 1);
1621                 recover->bbio = bbio;
1622                 recover->map_length = mapped_length;
1623
1624                 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1625
1626                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1627
1628                 for (mirror_index = 0; mirror_index < nmirrors;
1629                      mirror_index++) {
1630                         struct scrub_block *sblock;
1631                         struct scrub_page *page;
1632
1633                         sblock = sblocks_for_recheck + mirror_index;
1634                         sblock->sctx = sctx;
1635
1636                         page = kzalloc(sizeof(*page), GFP_NOFS);
1637                         if (!page) {
1638 leave_nomem:
1639                                 spin_lock(&sctx->stat_lock);
1640                                 sctx->stat.malloc_errors++;
1641                                 spin_unlock(&sctx->stat_lock);
1642                                 scrub_put_recover(fs_info, recover);
1643                                 return -ENOMEM;
1644                         }
1645                         scrub_page_get(page);
1646                         sblock->pagev[page_index] = page;
1647                         page->sblock = sblock;
1648                         page->flags = flags;
1649                         page->generation = generation;
1650                         page->logical = logical;
1651                         page->have_csum = have_csum;
1652                         if (have_csum)
1653                                 memcpy(page->csum,
1654                                        original_sblock->pagev[0]->csum,
1655                                        sctx->csum_size);
1656
1657                         scrub_stripe_index_and_offset(logical,
1658                                                       bbio->map_type,
1659                                                       bbio->raid_map,
1660                                                       mapped_length,
1661                                                       bbio->num_stripes -
1662                                                       bbio->num_tgtdevs,
1663                                                       mirror_index,
1664                                                       &stripe_index,
1665                                                       &stripe_offset);
1666                         page->physical = bbio->stripes[stripe_index].physical +
1667                                          stripe_offset;
1668                         page->dev = bbio->stripes[stripe_index].dev;
1669
1670                         BUG_ON(page_index >= original_sblock->page_count);
1671                         page->physical_for_dev_replace =
1672                                 original_sblock->pagev[page_index]->
1673                                 physical_for_dev_replace;
1674                         /* for missing devices, dev->bdev is NULL */
1675                         page->mirror_num = mirror_index + 1;
1676                         sblock->page_count++;
1677                         page->page = alloc_page(GFP_NOFS);
1678                         if (!page->page)
1679                                 goto leave_nomem;
1680
1681                         scrub_get_recover(recover);
1682                         page->recover = recover;
1683                 }
1684                 scrub_put_recover(fs_info, recover);
1685                 length -= sublen;
1686                 logical += sublen;
1687                 page_index++;
1688         }
1689
1690         return 0;
1691 }
1692
1693 static void scrub_bio_wait_endio(struct bio *bio)
1694 {
1695         complete(bio->bi_private);
1696 }
1697
1698 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1699                                         struct bio *bio,
1700                                         struct scrub_page *page)
1701 {
1702         DECLARE_COMPLETION_ONSTACK(done);
1703         int ret;
1704         int mirror_num;
1705
1706         bio->bi_iter.bi_sector = page->logical >> 9;
1707         bio->bi_private = &done;
1708         bio->bi_end_io = scrub_bio_wait_endio;
1709
1710         mirror_num = page->sblock->pagev[0]->mirror_num;
1711         ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
1712                                     page->recover->map_length,
1713                                     mirror_num, 0);
1714         if (ret)
1715                 return ret;
1716
1717         wait_for_completion_io(&done);
1718         return blk_status_to_errno(bio->bi_status);
1719 }
1720
1721 /*
1722  * This function checks the on-disk data for checksum errors, header errors
1723  * and read I/O errors. If any I/O errors happen, the exact pages that are
1724  * affected are marked as bad. The goal is to enable scrub to take those
1725  * pages that are not errored from all the mirrors so that the pages that
1726  * are errored in the just handled mirror can be repaired.
1727  */
1728 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1729                                 struct scrub_block *sblock,
1730                                 int retry_failed_mirror)
1731 {
1732         int page_num;
1733
1734         sblock->no_io_error_seen = 1;
1735
1736         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1737                 struct bio *bio;
1738                 struct scrub_page *page = sblock->pagev[page_num];
1739
1740                 if (page->dev->bdev == NULL) {
1741                         page->io_error = 1;
1742                         sblock->no_io_error_seen = 0;
1743                         continue;
1744                 }
1745
1746                 WARN_ON(!page->page);
1747                 bio = btrfs_io_bio_alloc(1);
1748                 bio_set_dev(bio, page->dev->bdev);
1749
1750                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1751                 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1752                         if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
1753                                 page->io_error = 1;
1754                                 sblock->no_io_error_seen = 0;
1755                         }
1756                 } else {
1757                         bio->bi_iter.bi_sector = page->physical >> 9;
1758                         bio_set_op_attrs(bio, REQ_OP_READ, 0);
1759
1760                         if (btrfsic_submit_bio_wait(bio)) {
1761                                 page->io_error = 1;
1762                                 sblock->no_io_error_seen = 0;
1763                         }
1764                 }
1765
1766                 bio_put(bio);
1767         }
1768
1769         if (sblock->no_io_error_seen)
1770                 scrub_recheck_block_checksum(sblock);
1771 }
1772
1773 static inline int scrub_check_fsid(u8 fsid[],
1774                                    struct scrub_page *spage)
1775 {
1776         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1777         int ret;
1778
1779         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1780         return !ret;
1781 }
1782
1783 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1784 {
1785         sblock->header_error = 0;
1786         sblock->checksum_error = 0;
1787         sblock->generation_error = 0;
1788
1789         if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1790                 scrub_checksum_data(sblock);
1791         else
1792                 scrub_checksum_tree_block(sblock);
1793 }
1794
1795 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1796                                              struct scrub_block *sblock_good)
1797 {
1798         int page_num;
1799         int ret = 0;
1800
1801         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1802                 int ret_sub;
1803
1804                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1805                                                            sblock_good,
1806                                                            page_num, 1);
1807                 if (ret_sub)
1808                         ret = ret_sub;
1809         }
1810
1811         return ret;
1812 }
1813
1814 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1815                                             struct scrub_block *sblock_good,
1816                                             int page_num, int force_write)
1817 {
1818         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1819         struct scrub_page *page_good = sblock_good->pagev[page_num];
1820         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1821
1822         BUG_ON(page_bad->page == NULL);
1823         BUG_ON(page_good->page == NULL);
1824         if (force_write || sblock_bad->header_error ||
1825             sblock_bad->checksum_error || page_bad->io_error) {
1826                 struct bio *bio;
1827                 int ret;
1828
1829                 if (!page_bad->dev->bdev) {
1830                         btrfs_warn_rl(fs_info,
1831                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1832                         return -EIO;
1833                 }
1834
1835                 bio = btrfs_io_bio_alloc(1);
1836                 bio_set_dev(bio, page_bad->dev->bdev);
1837                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1838                 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1839
1840                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1841                 if (PAGE_SIZE != ret) {
1842                         bio_put(bio);
1843                         return -EIO;
1844                 }
1845
1846                 if (btrfsic_submit_bio_wait(bio)) {
1847                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1848                                 BTRFS_DEV_STAT_WRITE_ERRS);
1849                         btrfs_dev_replace_stats_inc(
1850                                 &fs_info->dev_replace.num_write_errors);
1851                         bio_put(bio);
1852                         return -EIO;
1853                 }
1854                 bio_put(bio);
1855         }
1856
1857         return 0;
1858 }
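
/*
 * Illustrative restatement of the rewrite condition used above (an unused
 * helper added for documentation): outside of dev-replace (force_write == 0)
 * a page is only rewritten when the bad block shows a header or checksum
 * error, or when that particular page had an I/O error, so a page that is
 * already good is never overwritten.
 */
static inline bool scrub_page_needs_rewrite(struct scrub_block *sblock_bad,
					    struct scrub_page *page_bad,
					    int force_write)
{
	return force_write || sblock_bad->header_error ||
	       sblock_bad->checksum_error || page_bad->io_error;
}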
1859
1860 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1861 {
1862         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1863         int page_num;
1864
1865         /*
1866          * This block is used for checking the parity on the source device,
1867          * so the data need not be written to the destination device.
1868          */
1869         if (sblock->sparity)
1870                 return;
1871
1872         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1873                 int ret;
1874
1875                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1876                 if (ret)
1877                         btrfs_dev_replace_stats_inc(
1878                                 &fs_info->dev_replace.num_write_errors);
1879         }
1880 }
1881
1882 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1883                                            int page_num)
1884 {
1885         struct scrub_page *spage = sblock->pagev[page_num];
1886
1887         BUG_ON(spage->page == NULL);
1888         if (spage->io_error) {
1889                 void *mapped_buffer = kmap_atomic(spage->page);
1890
1891                 clear_page(mapped_buffer);
1892                 flush_dcache_page(spage->page);
1893                 kunmap_atomic(mapped_buffer);
1894         }
1895         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1896 }
1897
1898 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1899                                     struct scrub_page *spage)
1900 {
1901         struct scrub_bio *sbio;
1902         int ret;
1903
1904         mutex_lock(&sctx->wr_lock);
1905 again:
1906         if (!sctx->wr_curr_bio) {
1907                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1908                                               GFP_KERNEL);
1909                 if (!sctx->wr_curr_bio) {
1910                         mutex_unlock(&sctx->wr_lock);
1911                         return -ENOMEM;
1912                 }
1913                 sctx->wr_curr_bio->sctx = sctx;
1914                 sctx->wr_curr_bio->page_count = 0;
1915         }
1916         sbio = sctx->wr_curr_bio;
1917         if (sbio->page_count == 0) {
1918                 struct bio *bio;
1919
1920                 sbio->physical = spage->physical_for_dev_replace;
1921                 sbio->logical = spage->logical;
1922                 sbio->dev = sctx->wr_tgtdev;
1923                 bio = sbio->bio;
1924                 if (!bio) {
1925                         bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1926                         sbio->bio = bio;
1927                 }
1928
1929                 bio->bi_private = sbio;
1930                 bio->bi_end_io = scrub_wr_bio_end_io;
1931                 bio_set_dev(bio, sbio->dev->bdev);
1932                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1933                 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1934                 sbio->status = 0;
1935         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1936                    spage->physical_for_dev_replace ||
1937                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1938                    spage->logical) {
1939                 scrub_wr_submit(sctx);
1940                 goto again;
1941         }
1942
1943         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1944         if (ret != PAGE_SIZE) {
1945                 if (sbio->page_count < 1) {
1946                         bio_put(sbio->bio);
1947                         sbio->bio = NULL;
1948                         mutex_unlock(&sctx->wr_lock);
1949                         return -EIO;
1950                 }
1951                 scrub_wr_submit(sctx);
1952                 goto again;
1953         }
1954
1955         sbio->pagev[sbio->page_count] = spage;
1956         scrub_page_get(spage);
1957         sbio->page_count++;
1958         if (sbio->page_count == sctx->pages_per_wr_bio)
1959                 scrub_wr_submit(sctx);
1960         mutex_unlock(&sctx->wr_lock);
1961
1962         return 0;
1963 }
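
/*
 * Illustrative sketch (an unused helper) of the contiguity test in
 * scrub_add_page_to_wr_bio(): a page is only appended to the current write
 * bio when it continues the bio both physically on the replace target and
 * logically; otherwise the bio is submitted first and a fresh one is started.
 */
static inline bool scrub_wr_page_is_contiguous(struct scrub_bio *sbio,
					       struct scrub_page *spage)
{
	return sbio->physical + sbio->page_count * PAGE_SIZE ==
			spage->physical_for_dev_replace &&
	       sbio->logical + sbio->page_count * PAGE_SIZE ==
			spage->logical;
}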
1964
1965 static void scrub_wr_submit(struct scrub_ctx *sctx)
1966 {
1967         struct scrub_bio *sbio;
1968
1969         if (!sctx->wr_curr_bio)
1970                 return;
1971
1972         sbio = sctx->wr_curr_bio;
1973         sctx->wr_curr_bio = NULL;
1974         WARN_ON(!sbio->bio->bi_disk);
1975         scrub_pending_bio_inc(sctx);
1976         /* Process all writes in a single worker thread so that the block
1977          * layer orders the requests before sending them to the driver,
1978          * which doubled the write performance on spinning disks when
1979          * measured with Linux 3.5. */
1980         btrfsic_submit_bio(sbio->bio);
1981 }
1982
1983 static void scrub_wr_bio_end_io(struct bio *bio)
1984 {
1985         struct scrub_bio *sbio = bio->bi_private;
1986         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1987
1988         sbio->status = bio->bi_status;
1989         sbio->bio = bio;
1990
1991         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1992                          scrub_wr_bio_end_io_worker, NULL, NULL);
1993         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1994 }
1995
1996 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1997 {
1998         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1999         struct scrub_ctx *sctx = sbio->sctx;
2000         int i;
2001
2002         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
2003         if (sbio->status) {
2004                 struct btrfs_dev_replace *dev_replace =
2005                         &sbio->sctx->fs_info->dev_replace;
2006
2007                 for (i = 0; i < sbio->page_count; i++) {
2008                         struct scrub_page *spage = sbio->pagev[i];
2009
2010                         spage->io_error = 1;
2011                         btrfs_dev_replace_stats_inc(&dev_replace->
2012                                                     num_write_errors);
2013                 }
2014         }
2015
2016         for (i = 0; i < sbio->page_count; i++)
2017                 scrub_page_put(sbio->pagev[i]);
2018
2019         bio_put(sbio->bio);
2020         kfree(sbio);
2021         scrub_pending_bio_dec(sctx);
2022 }
2023
2024 static int scrub_checksum(struct scrub_block *sblock)
2025 {
2026         u64 flags;
2027         int ret;
2028
2029         /*
2030          * No need to initialize these stats currently,
2031          * because this function only uses the return value
2032          * instead of these stats values.
2033          *
2034          * Todo:
2035          * always use stats
2036          */
2037         sblock->header_error = 0;
2038         sblock->generation_error = 0;
2039         sblock->checksum_error = 0;
2040
2041         WARN_ON(sblock->page_count < 1);
2042         flags = sblock->pagev[0]->flags;
2043         ret = 0;
2044         if (flags & BTRFS_EXTENT_FLAG_DATA)
2045                 ret = scrub_checksum_data(sblock);
2046         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
2047                 ret = scrub_checksum_tree_block(sblock);
2048         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
2049                 (void)scrub_checksum_super(sblock);
2050         else
2051                 WARN_ON(1);
2052         if (ret)
2053                 scrub_handle_errored_block(sblock);
2054
2055         return ret;
2056 }
2057
2058 static int scrub_checksum_data(struct scrub_block *sblock)
2059 {
2060         struct scrub_ctx *sctx = sblock->sctx;
2061         u8 csum[BTRFS_CSUM_SIZE];
2062         u8 *on_disk_csum;
2063         struct page *page;
2064         void *buffer;
2065         u32 crc = ~(u32)0;
2066         u64 len;
2067         int index;
2068
2069         BUG_ON(sblock->page_count < 1);
2070         if (!sblock->pagev[0]->have_csum)
2071                 return 0;
2072
2073         on_disk_csum = sblock->pagev[0]->csum;
2074         page = sblock->pagev[0]->page;
2075         buffer = kmap_atomic(page);
2076
2077         len = sctx->fs_info->sectorsize;
2078         index = 0;
2079         for (;;) {
2080                 u64 l = min_t(u64, len, PAGE_SIZE);
2081
2082                 crc = btrfs_csum_data(buffer, crc, l);
2083                 kunmap_atomic(buffer);
2084                 len -= l;
2085                 if (len == 0)
2086                         break;
2087                 index++;
2088                 BUG_ON(index >= sblock->page_count);
2089                 BUG_ON(!sblock->pagev[index]->page);
2090                 page = sblock->pagev[index]->page;
2091                 buffer = kmap_atomic(page);
2092         }
2093
2094         btrfs_csum_final(crc, csum);
2095         if (memcmp(csum, on_disk_csum, sctx->csum_size))
2096                 sblock->checksum_error = 1;
2097
2098         return sblock->checksum_error;
2099 }
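
/*
 * Illustrative sketch (an unused helper) of the checksum primitive used
 * above, assuming the sector data is already mapped (the kmap_atomic()
 * handling is omitted): crc32c is accumulated over the sector, folded into
 * the first csum_size bytes and compared with the checksum stored on disk.
 */
static inline int scrub_data_csum_matches(const char *data, u32 sectorsize,
					  const u8 *on_disk_csum, u32 csum_size)
{
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;

	crc = btrfs_csum_data(data, crc, sectorsize);
	btrfs_csum_final(crc, csum);
	return memcmp(csum, on_disk_csum, csum_size) == 0;
}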
2100
2101 static int scrub_checksum_tree_block(struct scrub_block *sblock)
2102 {
2103         struct scrub_ctx *sctx = sblock->sctx;
2104         struct btrfs_header *h;
2105         struct btrfs_fs_info *fs_info = sctx->fs_info;
2106         u8 calculated_csum[BTRFS_CSUM_SIZE];
2107         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2108         struct page *page;
2109         void *mapped_buffer;
2110         u64 mapped_size;
2111         void *p;
2112         u32 crc = ~(u32)0;
2113         u64 len;
2114         int index;
2115
2116         BUG_ON(sblock->page_count < 1);
2117         page = sblock->pagev[0]->page;
2118         mapped_buffer = kmap_atomic(page);
2119         h = (struct btrfs_header *)mapped_buffer;
2120         memcpy(on_disk_csum, h->csum, sctx->csum_size);
2121
2122         /*
2123          * we don't use the getter functions here, as we
2124          * a) don't have an extent buffer and
2125          * b) the page is already kmapped
2126          */
2127         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
2128                 sblock->header_error = 1;
2129
2130         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
2131                 sblock->header_error = 1;
2132                 sblock->generation_error = 1;
2133         }
2134
2135         if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
2136                 sblock->header_error = 1;
2137
2138         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
2139                    BTRFS_UUID_SIZE))
2140                 sblock->header_error = 1;
2141
2142         len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
2143         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2144         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2145         index = 0;
2146         for (;;) {
2147                 u64 l = min_t(u64, len, mapped_size);
2148
2149                 crc = btrfs_csum_data(p, crc, l);
2150                 kunmap_atomic(mapped_buffer);
2151                 len -= l;
2152                 if (len == 0)
2153                         break;
2154                 index++;
2155                 BUG_ON(index >= sblock->page_count);
2156                 BUG_ON(!sblock->pagev[index]->page);
2157                 page = sblock->pagev[index]->page;
2158                 mapped_buffer = kmap_atomic(page);
2159                 mapped_size = PAGE_SIZE;
2160                 p = mapped_buffer;
2161         }
2162
2163         btrfs_csum_final(crc, calculated_csum);
2164         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2165                 sblock->checksum_error = 1;
2166
2167         return sblock->header_error || sblock->checksum_error;
2168 }
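
/*
 * Illustrative sketch (an unused helper with assumed sizes): how many kmap
 * iterations the checksum loop in scrub_checksum_tree_block() performs for a
 * given nodesize.  The first page contributes PAGE_SIZE - BTRFS_CSUM_SIZE
 * bytes because the stored checksum itself is skipped; every further page
 * contributes a full PAGE_SIZE.  With 4K pages and 16K nodes this yields 4.
 */
static inline int scrub_tree_csum_iterations(u32 nodesize)
{
	u64 len = nodesize - BTRFS_CSUM_SIZE;
	u64 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
	int iterations = 0;

	for (;;) {
		u64 l = min_t(u64, len, mapped_size);

		iterations++;
		len -= l;
		if (len == 0)
			break;
		mapped_size = PAGE_SIZE;
	}
	return iterations;
}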
2169
2170 static int scrub_checksum_super(struct scrub_block *sblock)
2171 {
2172         struct btrfs_super_block *s;
2173         struct scrub_ctx *sctx = sblock->sctx;
2174         u8 calculated_csum[BTRFS_CSUM_SIZE];
2175         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2176         struct page *page;
2177         void *mapped_buffer;
2178         u64 mapped_size;
2179         void *p;
2180         u32 crc = ~(u32)0;
2181         int fail_gen = 0;
2182         int fail_cor = 0;
2183         u64 len;
2184         int index;
2185
2186         BUG_ON(sblock->page_count < 1);
2187         page = sblock->pagev[0]->page;
2188         mapped_buffer = kmap_atomic(page);
2189         s = (struct btrfs_super_block *)mapped_buffer;
2190         memcpy(on_disk_csum, s->csum, sctx->csum_size);
2191
2192         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
2193                 ++fail_cor;
2194
2195         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
2196                 ++fail_gen;
2197
2198         if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
2199                 ++fail_cor;
2200
2201         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2202         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2203         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2204         index = 0;
2205         for (;;) {
2206                 u64 l = min_t(u64, len, mapped_size);
2207
2208                 crc = btrfs_csum_data(p, crc, l);
2209                 kunmap_atomic(mapped_buffer);
2210                 len -= l;
2211                 if (len == 0)
2212                         break;
2213                 index++;
2214                 BUG_ON(index >= sblock->page_count);
2215                 BUG_ON(!sblock->pagev[index]->page);
2216                 page = sblock->pagev[index]->page;
2217                 mapped_buffer = kmap_atomic(page);
2218                 mapped_size = PAGE_SIZE;
2219                 p = mapped_buffer;
2220         }
2221
2222         btrfs_csum_final(crc, calculated_csum);
2223         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2224                 ++fail_cor;
2225
2226         if (fail_cor + fail_gen) {
2227                 /*
2228                  * If we find an error in a super block, we just report it.
2229                  * Super blocks get rewritten with the next transaction
2230                  * commit anyway.
2231                  */
2232                 spin_lock(&sctx->stat_lock);
2233                 ++sctx->stat.super_errors;
2234                 spin_unlock(&sctx->stat_lock);
2235                 if (fail_cor)
2236                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2237                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2238                 else
2239                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2240                                 BTRFS_DEV_STAT_GENERATION_ERRS);
2241         }
2242
2243         return fail_cor + fail_gen;
2244 }
2245
2246 static void scrub_block_get(struct scrub_block *sblock)
2247 {
2248         refcount_inc(&sblock->refs);
2249 }
2250
2251 static void scrub_block_put(struct scrub_block *sblock)
2252 {
2253         if (refcount_dec_and_test(&sblock->refs)) {
2254                 int i;
2255
2256                 if (sblock->sparity)
2257                         scrub_parity_put(sblock->sparity);
2258
2259                 for (i = 0; i < sblock->page_count; i++)
2260                         scrub_page_put(sblock->pagev[i]);
2261                 kfree(sblock);
2262         }
2263 }
2264
2265 static void scrub_page_get(struct scrub_page *spage)
2266 {
2267         atomic_inc(&spage->refs);
2268 }
2269
2270 static void scrub_page_put(struct scrub_page *spage)
2271 {
2272         if (atomic_dec_and_test(&spage->refs)) {
2273                 if (spage->page)
2274                         __free_page(spage->page);
2275                 kfree(spage);
2276         }
2277 }
2278
2279 static void scrub_submit(struct scrub_ctx *sctx)
2280 {
2281         struct scrub_bio *sbio;
2282
2283         if (sctx->curr == -1)
2284                 return;
2285
2286         sbio = sctx->bios[sctx->curr];
2287         sctx->curr = -1;
2288         scrub_pending_bio_inc(sctx);
2289         btrfsic_submit_bio(sbio->bio);
2290 }
2291
2292 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2293                                     struct scrub_page *spage)
2294 {
2295         struct scrub_block *sblock = spage->sblock;
2296         struct scrub_bio *sbio;
2297         int ret;
2298
2299 again:
2300         /*
2301          * grab a fresh bio or wait for one to become available
2302          */
2303         while (sctx->curr == -1) {
2304                 spin_lock(&sctx->list_lock);
2305                 sctx->curr = sctx->first_free;
2306                 if (sctx->curr != -1) {
2307                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2308                         sctx->bios[sctx->curr]->next_free = -1;
2309                         sctx->bios[sctx->curr]->page_count = 0;
2310                         spin_unlock(&sctx->list_lock);
2311                 } else {
2312                         spin_unlock(&sctx->list_lock);
2313                         wait_event(sctx->list_wait, sctx->first_free != -1);
2314                 }
2315         }
2316         sbio = sctx->bios[sctx->curr];
2317         if (sbio->page_count == 0) {
2318                 struct bio *bio;
2319
2320                 sbio->physical = spage->physical;
2321                 sbio->logical = spage->logical;
2322                 sbio->dev = spage->dev;
2323                 bio = sbio->bio;
2324                 if (!bio) {
2325                         bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
2326                         sbio->bio = bio;
2327                 }
2328
2329                 bio->bi_private = sbio;
2330                 bio->bi_end_io = scrub_bio_end_io;
2331                 bio_set_dev(bio, sbio->dev->bdev);
2332                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2333                 bio_set_op_attrs(bio, REQ_OP_READ, 0);
2334                 sbio->status = 0;
2335         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2336                    spage->physical ||
2337                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2338                    spage->logical ||
2339                    sbio->dev != spage->dev) {
2340                 scrub_submit(sctx);
2341                 goto again;
2342         }
2343
2344         sbio->pagev[sbio->page_count] = spage;
2345         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2346         if (ret != PAGE_SIZE) {
2347                 if (sbio->page_count < 1) {
2348                         bio_put(sbio->bio);
2349                         sbio->bio = NULL;
2350                         return -EIO;
2351                 }
2352                 scrub_submit(sctx);
2353                 goto again;
2354         }
2355
2356         scrub_block_get(sblock); /* one for the page added to the bio */
2357         atomic_inc(&sblock->outstanding_pages);
2358         sbio->page_count++;
2359         if (sbio->page_count == sctx->pages_per_rd_bio)
2360                 scrub_submit(sctx);
2361
2362         return 0;
2363 }
2364
2365 static void scrub_missing_raid56_end_io(struct bio *bio)
2366 {
2367         struct scrub_block *sblock = bio->bi_private;
2368         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2369
2370         if (bio->bi_status)
2371                 sblock->no_io_error_seen = 0;
2372
2373         bio_put(bio);
2374
2375         btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2376 }
2377
2378 static void scrub_missing_raid56_worker(struct btrfs_work *work)
2379 {
2380         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2381         struct scrub_ctx *sctx = sblock->sctx;
2382         struct btrfs_fs_info *fs_info = sctx->fs_info;
2383         u64 logical;
2384         struct btrfs_device *dev;
2385
2386         logical = sblock->pagev[0]->logical;
2387         dev = sblock->pagev[0]->dev;
2388
2389         if (sblock->no_io_error_seen)
2390                 scrub_recheck_block_checksum(sblock);
2391
2392         if (!sblock->no_io_error_seen) {
2393                 spin_lock(&sctx->stat_lock);
2394                 sctx->stat.read_errors++;
2395                 spin_unlock(&sctx->stat_lock);
2396                 btrfs_err_rl_in_rcu(fs_info,
2397                         "IO error rebuilding logical %llu for dev %s",
2398                         logical, rcu_str_deref(dev->name));
2399         } else if (sblock->header_error || sblock->checksum_error) {
2400                 spin_lock(&sctx->stat_lock);
2401                 sctx->stat.uncorrectable_errors++;
2402                 spin_unlock(&sctx->stat_lock);
2403                 btrfs_err_rl_in_rcu(fs_info,
2404                         "failed to rebuild valid logical %llu for dev %s",
2405                         logical, rcu_str_deref(dev->name));
2406         } else {
2407                 scrub_write_block_to_dev_replace(sblock);
2408         }
2409
2410         scrub_block_put(sblock);
2411
2412         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2413                 mutex_lock(&sctx->wr_lock);
2414                 scrub_wr_submit(sctx);
2415                 mutex_unlock(&sctx->wr_lock);
2416         }
2417
2418         scrub_pending_bio_dec(sctx);
2419 }
2420
2421 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2422 {
2423         struct scrub_ctx *sctx = sblock->sctx;
2424         struct btrfs_fs_info *fs_info = sctx->fs_info;
2425         u64 length = sblock->page_count * PAGE_SIZE;
2426         u64 logical = sblock->pagev[0]->logical;
2427         struct btrfs_bio *bbio = NULL;
2428         struct bio *bio;
2429         struct btrfs_raid_bio *rbio;
2430         int ret;
2431         int i;
2432
2433         btrfs_bio_counter_inc_blocked(fs_info);
2434         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2435                         &length, &bbio);
2436         if (ret || !bbio || !bbio->raid_map)
2437                 goto bbio_out;
2438
2439         if (WARN_ON(!sctx->is_dev_replace ||
2440                     !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2441                 /*
2442                  * We shouldn't be scrubbing a missing device. Even for dev
2443                  * replace, we should only get here for RAID 5/6. We either
2444                  * managed to mount something with no mirrors remaining or
2445                  * there's a bug in scrub_remap_extent()/btrfs_map_block().
2446                  */
2447                 goto bbio_out;
2448         }
2449
2450         bio = btrfs_io_bio_alloc(0);
2451         bio->bi_iter.bi_sector = logical >> 9;
2452         bio->bi_private = sblock;
2453         bio->bi_end_io = scrub_missing_raid56_end_io;
2454
2455         rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2456         if (!rbio)
2457                 goto rbio_out;
2458
2459         for (i = 0; i < sblock->page_count; i++) {
2460                 struct scrub_page *spage = sblock->pagev[i];
2461
2462                 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2463         }
2464
2465         btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2466                         scrub_missing_raid56_worker, NULL, NULL);
2467         scrub_block_get(sblock);
2468         scrub_pending_bio_inc(sctx);
2469         raid56_submit_missing_rbio(rbio);
2470         return;
2471
2472 rbio_out:
2473         bio_put(bio);
2474 bbio_out:
2475         btrfs_bio_counter_dec(fs_info);
2476         btrfs_put_bbio(bbio);
2477         spin_lock(&sctx->stat_lock);
2478         sctx->stat.malloc_errors++;
2479         spin_unlock(&sctx->stat_lock);
2480 }
2481
2482 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2483                        u64 physical, struct btrfs_device *dev, u64 flags,
2484                        u64 gen, int mirror_num, u8 *csum, int force,
2485                        u64 physical_for_dev_replace)
2486 {
2487         struct scrub_block *sblock;
2488         int index;
2489
2490         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2491         if (!sblock) {
2492                 spin_lock(&sctx->stat_lock);
2493                 sctx->stat.malloc_errors++;
2494                 spin_unlock(&sctx->stat_lock);
2495                 return -ENOMEM;
2496         }
2497
2498         /* one ref inside this function, plus one for each page added to
2499          * a bio later on */
2500         refcount_set(&sblock->refs, 1);
2501         sblock->sctx = sctx;
2502         sblock->no_io_error_seen = 1;
2503
2504         for (index = 0; len > 0; index++) {
2505                 struct scrub_page *spage;
2506                 u64 l = min_t(u64, len, PAGE_SIZE);
2507
2508                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2509                 if (!spage) {
2510 leave_nomem:
2511                         spin_lock(&sctx->stat_lock);
2512                         sctx->stat.malloc_errors++;
2513                         spin_unlock(&sctx->stat_lock);
2514                         scrub_block_put(sblock);
2515                         return -ENOMEM;
2516                 }
2517                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2518                 scrub_page_get(spage);
2519                 sblock->pagev[index] = spage;
2520                 spage->sblock = sblock;
2521                 spage->dev = dev;
2522                 spage->flags = flags;
2523                 spage->generation = gen;
2524                 spage->logical = logical;
2525                 spage->physical = physical;
2526                 spage->physical_for_dev_replace = physical_for_dev_replace;
2527                 spage->mirror_num = mirror_num;
2528                 if (csum) {
2529                         spage->have_csum = 1;
2530                         memcpy(spage->csum, csum, sctx->csum_size);
2531                 } else {
2532                         spage->have_csum = 0;
2533                 }
2534                 sblock->page_count++;
2535                 spage->page = alloc_page(GFP_KERNEL);
2536                 if (!spage->page)
2537                         goto leave_nomem;
2538                 len -= l;
2539                 logical += l;
2540                 physical += l;
2541                 physical_for_dev_replace += l;
2542         }
2543
2544         WARN_ON(sblock->page_count == 0);
2545         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2546                 /*
2547                  * This case should only be hit for RAID 5/6 device replace. See
2548                  * the comment in scrub_missing_raid56_pages() for details.
2549                  */
2550                 scrub_missing_raid56_pages(sblock);
2551         } else {
2552                 for (index = 0; index < sblock->page_count; index++) {
2553                         struct scrub_page *spage = sblock->pagev[index];
2554                         int ret;
2555
2556                         ret = scrub_add_page_to_rd_bio(sctx, spage);
2557                         if (ret) {
2558                                 scrub_block_put(sblock);
2559                                 return ret;
2560                         }
2561                 }
2562
2563                 if (force)
2564                         scrub_submit(sctx);
2565         }
2566
2567         /* last one frees, either here or in bio completion for last page */
2568         scrub_block_put(sblock);
2569         return 0;
2570 }
2571
2572 static void scrub_bio_end_io(struct bio *bio)
2573 {
2574         struct scrub_bio *sbio = bio->bi_private;
2575         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2576
2577         sbio->status = bio->bi_status;
2578         sbio->bio = bio;
2579
2580         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2581 }
2582
2583 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2584 {
2585         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2586         struct scrub_ctx *sctx = sbio->sctx;
2587         int i;
2588
2589         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2590         if (sbio->status) {
2591                 for (i = 0; i < sbio->page_count; i++) {
2592                         struct scrub_page *spage = sbio->pagev[i];
2593
2594                         spage->io_error = 1;
2595                         spage->sblock->no_io_error_seen = 0;
2596                 }
2597         }
2598
2599         /* now complete the scrub_block items that have all pages completed */
2600         for (i = 0; i < sbio->page_count; i++) {
2601                 struct scrub_page *spage = sbio->pagev[i];
2602                 struct scrub_block *sblock = spage->sblock;
2603
2604                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2605                         scrub_block_complete(sblock);
2606                 scrub_block_put(sblock);
2607         }
2608
2609         bio_put(sbio->bio);
2610         sbio->bio = NULL;
2611         spin_lock(&sctx->list_lock);
2612         sbio->next_free = sctx->first_free;
2613         sctx->first_free = sbio->index;
2614         spin_unlock(&sctx->list_lock);
2615
2616         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2617                 mutex_lock(&sctx->wr_lock);
2618                 scrub_wr_submit(sctx);
2619                 mutex_unlock(&sctx->wr_lock);
2620         }
2621
2622         scrub_pending_bio_dec(sctx);
2623 }
2624
2625 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2626                                        unsigned long *bitmap,
2627                                        u64 start, u64 len)
2628 {
2629         u64 offset;
2630         u64 nsectors64;
2631         u32 nsectors;
2632         int sectorsize = sparity->sctx->fs_info->sectorsize;
2633
2634         if (len >= sparity->stripe_len) {
2635                 bitmap_set(bitmap, 0, sparity->nsectors);
2636                 return;
2637         }
2638
2639         start -= sparity->logic_start;
2640         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2641         offset = div_u64(offset, sectorsize);
2642         nsectors64 = div_u64(len, sectorsize);
2643
2644         ASSERT(nsectors64 < UINT_MAX);
2645         nsectors = (u32)nsectors64;
2646
2647         if (offset + nsectors <= sparity->nsectors) {
2648                 bitmap_set(bitmap, offset, nsectors);
2649                 return;
2650         }
2651
2652         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2653         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2654 }
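
/*
 * Illustrative sketch (an unused helper with hypothetical numbers) of the
 * wrap-around case handled at the end of __scrub_mark_bitmap(): with 4K
 * sectors and a 64K stripe (nsectors == 16), an error range starting at
 * sector 14 of the stripe and covering 4 sectors wraps around and is
 * recorded as two ranges, [14, 16) and [0, 2).
 */
static inline void scrub_mark_bitmap_wrap_example(unsigned long *bitmap)
{
	const u32 nsectors = 16;	/* assumed 64K stripe, 4K sectors */
	u32 offset = 14;		/* first affected sector in the stripe */
	u32 cnt = 4;			/* number of affected sectors */

	if (offset + cnt <= nsectors) {
		bitmap_set(bitmap, offset, cnt);
		return;
	}
	bitmap_set(bitmap, offset, nsectors - offset);		/* sectors 14..15 */
	bitmap_set(bitmap, 0, cnt - (nsectors - offset));	/* sectors 0..1 */
}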
2655
2656 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2657                                                    u64 start, u64 len)
2658 {
2659         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2660 }
2661
2662 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2663                                                   u64 start, u64 len)
2664 {
2665         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2666 }
2667
2668 static void scrub_block_complete(struct scrub_block *sblock)
2669 {
2670         int corrupted = 0;
2671
2672         if (!sblock->no_io_error_seen) {
2673                 corrupted = 1;
2674                 scrub_handle_errored_block(sblock);
2675         } else {
2676                 /*
2677                  * In the dev-replace case, if the block has a checksum
2678                  * error it is written via the repair mechanism; otherwise
2679                  * it is written here.
2680                  */
2681                 corrupted = scrub_checksum(sblock);
2682                 if (!corrupted && sblock->sctx->is_dev_replace)
2683                         scrub_write_block_to_dev_replace(sblock);
2684         }
2685
2686         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2687                 u64 start = sblock->pagev[0]->logical;
2688                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2689                           PAGE_SIZE;
2690
2691                 scrub_parity_mark_sectors_error(sblock->sparity,
2692                                                 start, end - start);
2693         }
2694 }
2695
2696 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2697 {
2698         struct btrfs_ordered_sum *sum = NULL;
2699         unsigned long index;
2700         unsigned long num_sectors;
2701
2702         while (!list_empty(&sctx->csum_list)) {
2703                 sum = list_first_entry(&sctx->csum_list,
2704                                        struct btrfs_ordered_sum, list);
2705                 if (sum->bytenr > logical)
2706                         return 0;
2707                 if (sum->bytenr + sum->len > logical)
2708                         break;
2709
2710                 ++sctx->stat.csum_discards;
2711                 list_del(&sum->list);
2712                 kfree(sum);
2713                 sum = NULL;
2714         }
2715         if (!sum)
2716                 return 0;
2717
2718         index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
2719         ASSERT(index < UINT_MAX);
2720
2721         num_sectors = sum->len / sctx->fs_info->sectorsize;
2722         memcpy(csum, sum->sums + index, sctx->csum_size);
2723         if (index == num_sectors - 1) {
2724                 list_del(&sum->list);
2725                 kfree(sum);
2726         }
2727         return 1;
2728 }
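
/*
 * Illustrative arithmetic for the lookup above (an unused helper with
 * hypothetical numbers): with 4K sectors and a csum item whose bytenr is 1M,
 * a lookup at logical 1M + 20K resolves to checksum entry index 5 within
 * that item.
 */
static inline u64 scrub_csum_index_example(void)
{
	const u64 bytenr = 1024 * 1024;		/* assumed sum->bytenr */
	const u32 sectorsize = 4096;		/* assumed sector size */
	u64 logical = bytenr + 20 * 1024;

	return div_u64(logical - bytenr, sectorsize);	/* == 5 */
}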
2729
2730 /* scrub extent tries to collect up to 128k (SCRUB_PAGES_PER_RD_BIO pages) for each bio */
2731 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2732                         u64 physical, struct btrfs_device *dev, u64 flags,
2733                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2734 {
2735         int ret;
2736         u8 csum[BTRFS_CSUM_SIZE];
2737         u32 blocksize;
2738
2739         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2740                 blocksize = sctx->fs_info->sectorsize;
2741                 spin_lock(&sctx->stat_lock);
2742                 sctx->stat.data_extents_scrubbed++;
2743                 sctx->stat.data_bytes_scrubbed += len;
2744                 spin_unlock(&sctx->stat_lock);
2745         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2746                 blocksize = sctx->fs_info->nodesize;
2747                 spin_lock(&sctx->stat_lock);
2748                 sctx->stat.tree_extents_scrubbed++;
2749                 sctx->stat.tree_bytes_scrubbed += len;
2750                 spin_unlock(&sctx->stat_lock);
2751         } else {
2752                 blocksize = sctx->fs_info->sectorsize;
2753                 WARN_ON(1);
2754         }
2755
2756         while (len) {
2757                 u64 l = min_t(u64, len, blocksize);
2758                 int have_csum = 0;
2759
2760                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2761                         /* push csums to sbio */
2762                         have_csum = scrub_find_csum(sctx, logical, csum);
2763                         if (have_csum == 0)
2764                                 ++sctx->stat.no_csum;
2765                         if (sctx->is_dev_replace && !have_csum) {
2766                                 ret = copy_nocow_pages(sctx, logical, l,
2767                                                        mirror_num,
2768                                                       physical_for_dev_replace);
2769                                 goto behind_scrub_pages;
2770                         }
2771                 }
2772                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2773                                   mirror_num, have_csum ? csum : NULL, 0,
2774                                   physical_for_dev_replace);
2775 behind_scrub_pages:
2776                 if (ret)
2777                         return ret;
2778                 len -= l;
2779                 logical += l;
2780                 physical += l;
2781                 physical_for_dev_replace += l;
2782         }
2783         return 0;
2784 }
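
/*
 * A sketch of how the loop above splits work, assuming a 4 KiB sectorsize
 * and a 16 KiB nodesize: a 64 KiB data extent is submitted as sixteen 4 KiB
 * scrub_pages() calls (each with its own csum, if one was found), while a
 * tree block is submitted as a single 16 KiB scrub_pages() call.
 */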
2785
2786 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2787                                   u64 logical, u64 len,
2788                                   u64 physical, struct btrfs_device *dev,
2789                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2790 {
2791         struct scrub_ctx *sctx = sparity->sctx;
2792         struct scrub_block *sblock;
2793         int index;
2794
2795         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2796         if (!sblock) {
2797                 spin_lock(&sctx->stat_lock);
2798                 sctx->stat.malloc_errors++;
2799                 spin_unlock(&sctx->stat_lock);
2800                 return -ENOMEM;
2801         }
2802
2803         /* one ref inside this function, plus one for each page added to
2804          * a bio later on */
2805         refcount_set(&sblock->refs, 1);
2806         sblock->sctx = sctx;
2807         sblock->no_io_error_seen = 1;
2808         sblock->sparity = sparity;
2809         scrub_parity_get(sparity);
2810
2811         for (index = 0; len > 0; index++) {
2812                 struct scrub_page *spage;
2813                 u64 l = min_t(u64, len, PAGE_SIZE);
2814
2815                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2816                 if (!spage) {
2817 leave_nomem:
2818                         spin_lock(&sctx->stat_lock);
2819                         sctx->stat.malloc_errors++;
2820                         spin_unlock(&sctx->stat_lock);
2821                         scrub_block_put(sblock);
2822                         return -ENOMEM;
2823                 }
2824                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2825                 /* For scrub block */
2826                 scrub_page_get(spage);
2827                 sblock->pagev[index] = spage;
2828                 /* For scrub parity */
2829                 scrub_page_get(spage);
2830                 list_add_tail(&spage->list, &sparity->spages);
2831                 spage->sblock = sblock;
2832                 spage->dev = dev;
2833                 spage->flags = flags;
2834                 spage->generation = gen;
2835                 spage->logical = logical;
2836                 spage->physical = physical;
2837                 spage->mirror_num = mirror_num;
2838                 if (csum) {
2839                         spage->have_csum = 1;
2840                         memcpy(spage->csum, csum, sctx->csum_size);
2841                 } else {
2842                         spage->have_csum = 0;
2843                 }
2844                 sblock->page_count++;
2845                 spage->page = alloc_page(GFP_KERNEL);
2846                 if (!spage->page)
2847                         goto leave_nomem;
2848                 len -= l;
2849                 logical += l;
2850                 physical += l;
2851         }
2852
2853         WARN_ON(sblock->page_count == 0);
2854         for (index = 0; index < sblock->page_count; index++) {
2855                 struct scrub_page *spage = sblock->pagev[index];
2856                 int ret;
2857
2858                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2859                 if (ret) {
2860                         scrub_block_put(sblock);
2861                         return ret;
2862                 }
2863         }
2864
2865         /* last one frees, either here or in bio completion for last page */
2866         scrub_block_put(sblock);
2867         return 0;
2868 }
2869
2870 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2871                                    u64 logical, u64 len,
2872                                    u64 physical, struct btrfs_device *dev,
2873                                    u64 flags, u64 gen, int mirror_num)
2874 {
2875         struct scrub_ctx *sctx = sparity->sctx;
2876         int ret;
2877         u8 csum[BTRFS_CSUM_SIZE];
2878         u32 blocksize;
2879
2880         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2881                 scrub_parity_mark_sectors_error(sparity, logical, len);
2882                 return 0;
2883         }
2884
2885         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2886                 blocksize = sctx->fs_info->sectorsize;
2887         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2888                 blocksize = sctx->fs_info->nodesize;
2889         } else {
2890                 blocksize = sctx->fs_info->sectorsize;
2891                 WARN_ON(1);
2892         }
2893
2894         while (len) {
2895                 u64 l = min_t(u64, len, blocksize);
2896                 int have_csum = 0;
2897
2898                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2899                         /* push csums to sbio */
2900                         have_csum = scrub_find_csum(sctx, logical, csum);
2901                         if (have_csum == 0)
2902                                 goto skip;
2903                 }
2904                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2905                                              flags, gen, mirror_num,
2906                                              have_csum ? csum : NULL);
2907                 if (ret)
2908                         return ret;
2909 skip:
2910                 len -= l;
2911                 logical += l;
2912                 physical += l;
2913         }
2914         return 0;
2915 }
2916
2917 /*
2918  * Given a physical address, this will calculate its
2919  * logical offset. If this is a parity stripe, it will return
2920  * the left-most data stripe's logical offset.
2921  *
2922  * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2923  */
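/*
 * A worked example under assumed geometry (3-device RAID5, 64 KiB
 * stripe_len, so nr_data_stripes() == 2); offsets are relative to the
 * chunk start, the caller adds the chunk's base:
 *   - num = 0 at the start of its device extent: data stripe, returns 0
 *     with *offset = 0.
 *   - num = 1 at the same relative position: data stripe, *offset = 64K.
 *   - num = 2 at the same relative position: parity of the first full
 *     stripe, returns 1 with *stripe_start = 0.
 *   - num = 2 one stripe_len further in: data stripe of the (rotated)
 *     second full stripe, *offset = 192K.
 */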
2924 static int get_raid56_logic_offset(u64 physical, int num,
2925                                    struct map_lookup *map, u64 *offset,
2926                                    u64 *stripe_start)
2927 {
2928         int i;
2929         int j = 0;
2930         u64 stripe_nr;
2931         u64 last_offset;
2932         u32 stripe_index;
2933         u32 rot;
2934
2935         last_offset = (physical - map->stripes[num].physical) *
2936                       nr_data_stripes(map);
2937         if (stripe_start)
2938                 *stripe_start = last_offset;
2939
2940         *offset = last_offset;
2941         for (i = 0; i < nr_data_stripes(map); i++) {
2942                 *offset = last_offset + i * map->stripe_len;
2943
2944                 stripe_nr = div64_u64(*offset, map->stripe_len);
2945                 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2946
2947                 /* Work out the disk rotation on this stripe-set */
2948                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2949                 /* calculate which stripe this data is located on */
2950                 rot += i;
2951                 stripe_index = rot % map->num_stripes;
2952                 if (stripe_index == num)
2953                         return 0;
2954                 if (stripe_index < num)
2955                         j++;
2956         }
2957         *offset = last_offset + j * map->stripe_len;
2958         return 1;
2959 }
2960
2961 static void scrub_free_parity(struct scrub_parity *sparity)
2962 {
2963         struct scrub_ctx *sctx = sparity->sctx;
2964         struct scrub_page *curr, *next;
2965         int nbits;
2966
2967         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2968         if (nbits) {
2969                 spin_lock(&sctx->stat_lock);
2970                 sctx->stat.read_errors += nbits;
2971                 sctx->stat.uncorrectable_errors += nbits;
2972                 spin_unlock(&sctx->stat_lock);
2973         }
2974
2975         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2976                 list_del_init(&curr->list);
2977                 scrub_page_put(curr);
2978         }
2979
2980         kfree(sparity);
2981 }
2982
2983 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2984 {
2985         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2986                                                     work);
2987         struct scrub_ctx *sctx = sparity->sctx;
2988
2989         scrub_free_parity(sparity);
2990         scrub_pending_bio_dec(sctx);
2991 }
2992
2993 static void scrub_parity_bio_endio(struct bio *bio)
2994 {
2995         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2996         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2997
2998         if (bio->bi_status)
2999                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3000                           sparity->nsectors);
3001
3002         bio_put(bio);
3003
3004         btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
3005                         scrub_parity_bio_endio_worker, NULL, NULL);
3006         btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
3007 }
3008
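/*
 * Note on the two bitmaps used below: dbitmap tracks the data sectors of
 * this parity stripe that scrub has covered (set in
 * scrub_parity_mark_sectors_data()), ebitmap tracks sectors where scrub saw
 * an error (scrub_parity_mark_sectors_error()). Before the parity check,
 * errored sectors are removed from dbitmap so only successfully read data
 * takes part in the RAID56 scrub rbio; whatever remains in ebitmap is
 * accounted as read/uncorrectable errors in scrub_free_parity().
 */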
3009 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
3010 {
3011         struct scrub_ctx *sctx = sparity->sctx;
3012         struct btrfs_fs_info *fs_info = sctx->fs_info;
3013         struct bio *bio;
3014         struct btrfs_raid_bio *rbio;
3015         struct btrfs_bio *bbio = NULL;
3016         u64 length;
3017         int ret;
3018
3019         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
3020                            sparity->nsectors))
3021                 goto out;
3022
3023         length = sparity->logic_end - sparity->logic_start;
3024
3025         btrfs_bio_counter_inc_blocked(fs_info);
3026         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
3027                                &length, &bbio);
3028         if (ret || !bbio || !bbio->raid_map)
3029                 goto bbio_out;
3030
3031         bio = btrfs_io_bio_alloc(0);
3032         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
3033         bio->bi_private = sparity;
3034         bio->bi_end_io = scrub_parity_bio_endio;
3035
3036         rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
3037                                               length, sparity->scrub_dev,
3038                                               sparity->dbitmap,
3039                                               sparity->nsectors);
3040         if (!rbio)
3041                 goto rbio_out;
3042
3043         scrub_pending_bio_inc(sctx);
3044         raid56_parity_submit_scrub_rbio(rbio);
3045         return;
3046
3047 rbio_out:
3048         bio_put(bio);
3049 bbio_out:
3050         btrfs_bio_counter_dec(fs_info);
3051         btrfs_put_bbio(bbio);
3052         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3053                   sparity->nsectors);
3054         spin_lock(&sctx->stat_lock);
3055         sctx->stat.malloc_errors++;
3056         spin_unlock(&sctx->stat_lock);
3057 out:
3058         scrub_free_parity(sparity);
3059 }
3060
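/*
 * For example, with a 64 KiB stripe_len and 4 KiB sectors (nsectors = 16)
 * on a 64-bit machine, scrub_calc_parity_bitmap_len() rounds up to a single
 * unsigned long, i.e. 8 bytes per bitmap; scrub_raid56_parity() allocates
 * 2 * bitmap_len so dbitmap and ebitmap each get one such area.
 */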
3061 static inline int scrub_calc_parity_bitmap_len(int nsectors)
3062 {
3063         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
3064 }
3065
3066 static void scrub_parity_get(struct scrub_parity *sparity)
3067 {
3068         refcount_inc(&sparity->refs);
3069 }
3070
3071 static void scrub_parity_put(struct scrub_parity *sparity)
3072 {
3073         if (!refcount_dec_and_test(&sparity->refs))
3074                 return;
3075
3076         scrub_parity_check_and_repair(sparity);
3077 }
3078
3079 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3080                                                   struct map_lookup *map,
3081                                                   struct btrfs_device *sdev,
3082                                                   struct btrfs_path *path,
3083                                                   u64 logic_start,
3084                                                   u64 logic_end)
3085 {
3086         struct btrfs_fs_info *fs_info = sctx->fs_info;
3087         struct btrfs_root *root = fs_info->extent_root;
3088         struct btrfs_root *csum_root = fs_info->csum_root;
3089         struct btrfs_extent_item *extent;
3090         struct btrfs_bio *bbio = NULL;
3091         u64 flags;
3092         int ret;
3093         int slot;
3094         struct extent_buffer *l;
3095         struct btrfs_key key;
3096         u64 generation;
3097         u64 extent_logical;
3098         u64 extent_physical;
3099         u64 extent_len;
3100         u64 mapped_length;
3101         struct btrfs_device *extent_dev;
3102         struct scrub_parity *sparity;
3103         int nsectors;
3104         int bitmap_len;
3105         int extent_mirror_num;
3106         int stop_loop = 0;
3107
3108         nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
3109         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
3110         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
3111                           GFP_NOFS);
3112         if (!sparity) {
3113                 spin_lock(&sctx->stat_lock);
3114                 sctx->stat.malloc_errors++;
3115                 spin_unlock(&sctx->stat_lock);
3116                 return -ENOMEM;
3117         }
3118
3119         sparity->stripe_len = map->stripe_len;
3120         sparity->nsectors = nsectors;
3121         sparity->sctx = sctx;
3122         sparity->scrub_dev = sdev;
3123         sparity->logic_start = logic_start;
3124         sparity->logic_end = logic_end;
3125         refcount_set(&sparity->refs, 1);
3126         INIT_LIST_HEAD(&sparity->spages);
3127         sparity->dbitmap = sparity->bitmap;
3128         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
3129
3130         ret = 0;
3131         while (logic_start < logic_end) {
3132                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3133                         key.type = BTRFS_METADATA_ITEM_KEY;
3134                 else
3135                         key.type = BTRFS_EXTENT_ITEM_KEY;
3136                 key.objectid = logic_start;
3137                 key.offset = (u64)-1;
3138
3139                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3140                 if (ret < 0)
3141                         goto out;
3142
3143                 if (ret > 0) {
3144                         ret = btrfs_previous_extent_item(root, path, 0);
3145                         if (ret < 0)
3146                                 goto out;
3147                         if (ret > 0) {
3148                                 btrfs_release_path(path);
3149                                 ret = btrfs_search_slot(NULL, root, &key,
3150                                                         path, 0, 0);
3151                                 if (ret < 0)
3152                                         goto out;
3153                         }
3154                 }
3155
3156                 stop_loop = 0;
3157                 while (1) {
3158                         u64 bytes;
3159
3160                         l = path->nodes[0];
3161                         slot = path->slots[0];
3162                         if (slot >= btrfs_header_nritems(l)) {
3163                                 ret = btrfs_next_leaf(root, path);
3164                                 if (ret == 0)
3165                                         continue;
3166                                 if (ret < 0)
3167                                         goto out;
3168
3169                                 stop_loop = 1;
3170                                 break;
3171                         }
3172                         btrfs_item_key_to_cpu(l, &key, slot);
3173
3174                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3175                             key.type != BTRFS_METADATA_ITEM_KEY)
3176                                 goto next;
3177
3178                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3179                                 bytes = fs_info->nodesize;
3180                         else
3181                                 bytes = key.offset;
3182
3183                         if (key.objectid + bytes <= logic_start)
3184                                 goto next;
3185
3186                         if (key.objectid >= logic_end) {
3187                                 stop_loop = 1;
3188                                 break;
3189                         }
3190
3191                         while (key.objectid >= logic_start + map->stripe_len)
3192                                 logic_start += map->stripe_len;
3193
3194                         extent = btrfs_item_ptr(l, slot,
3195                                                 struct btrfs_extent_item);
3196                         flags = btrfs_extent_flags(l, extent);
3197                         generation = btrfs_extent_generation(l, extent);
3198
3199                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3200                             (key.objectid < logic_start ||
3201                              key.objectid + bytes >
3202                              logic_start + map->stripe_len)) {
3203                                 btrfs_err(fs_info,
3204                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3205                                           key.objectid, logic_start);
3206                                 spin_lock(&sctx->stat_lock);
3207                                 sctx->stat.uncorrectable_errors++;
3208                                 spin_unlock(&sctx->stat_lock);
3209                                 goto next;
3210                         }
3211 again:
3212                         extent_logical = key.objectid;
3213                         extent_len = bytes;
3214
3215                         if (extent_logical < logic_start) {
3216                                 extent_len -= logic_start - extent_logical;
3217                                 extent_logical = logic_start;
3218                         }
3219
3220                         if (extent_logical + extent_len >
3221                             logic_start + map->stripe_len)
3222                                 extent_len = logic_start + map->stripe_len -
3223                                              extent_logical;
3224
3225                         scrub_parity_mark_sectors_data(sparity, extent_logical,
3226                                                        extent_len);
3227
3228                         mapped_length = extent_len;
3229                         bbio = NULL;
3230                         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3231                                         extent_logical, &mapped_length, &bbio,
3232                                         0);
3233                         if (!ret) {
3234                                 if (!bbio || mapped_length < extent_len)
3235                                         ret = -EIO;
3236                         }
3237                         if (ret) {
3238                                 btrfs_put_bbio(bbio);
3239                                 goto out;
3240                         }
3241                         extent_physical = bbio->stripes[0].physical;
3242                         extent_mirror_num = bbio->mirror_num;
3243                         extent_dev = bbio->stripes[0].dev;
3244                         btrfs_put_bbio(bbio);
3245
3246                         ret = btrfs_lookup_csums_range(csum_root,
3247                                                 extent_logical,
3248                                                 extent_logical + extent_len - 1,
3249                                                 &sctx->csum_list, 1);
3250                         if (ret)
3251                                 goto out;
3252
3253                         ret = scrub_extent_for_parity(sparity, extent_logical,
3254                                                       extent_len,
3255                                                       extent_physical,
3256                                                       extent_dev, flags,
3257                                                       generation,
3258                                                       extent_mirror_num);
3259
3260                         scrub_free_csums(sctx);
3261
3262                         if (ret)
3263                                 goto out;
3264
3265                         if (extent_logical + extent_len <
3266                             key.objectid + bytes) {
3267                                 logic_start += map->stripe_len;
3268
3269                                 if (logic_start >= logic_end) {
3270                                         stop_loop = 1;
3271                                         break;
3272                                 }
3273
3274                                 if (logic_start < key.objectid + bytes) {
3275                                         cond_resched();
3276                                         goto again;
3277                                 }
3278                         }
3279 next:
3280                         path->slots[0]++;
3281                 }
3282
3283                 btrfs_release_path(path);
3284
3285                 if (stop_loop)
3286                         break;
3287
3288                 logic_start += map->stripe_len;
3289         }
3290 out:
3291         if (ret < 0)
3292                 scrub_parity_mark_sectors_error(sparity, logic_start,
3293                                                 logic_end - logic_start);
3294         scrub_parity_put(sparity);
3295         scrub_submit(sctx);
3296         mutex_lock(&sctx->wr_lock);
3297         scrub_wr_submit(sctx);
3298         mutex_unlock(&sctx->wr_lock);
3299
3300         btrfs_release_path(path);
3301         return ret < 0 ? ret : 0;
3302 }
3303
3304 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3305                                            struct map_lookup *map,
3306                                            struct btrfs_device *scrub_dev,
3307                                            int num, u64 base, u64 length,
3308                                            int is_dev_replace)
3309 {
3310         struct btrfs_path *path, *ppath;
3311         struct btrfs_fs_info *fs_info = sctx->fs_info;
3312         struct btrfs_root *root = fs_info->extent_root;
3313         struct btrfs_root *csum_root = fs_info->csum_root;
3314         struct btrfs_extent_item *extent;
3315         struct blk_plug plug;
3316         u64 flags;
3317         int ret;
3318         int slot;
3319         u64 nstripes;
3320         struct extent_buffer *l;
3321         u64 physical;
3322         u64 logical;
3323         u64 logic_end;
3324         u64 physical_end;
3325         u64 generation;
3326         int mirror_num;
3327         struct reada_control *reada1;
3328         struct reada_control *reada2;
3329         struct btrfs_key key;
3330         struct btrfs_key key_end;
3331         u64 increment = map->stripe_len;
3332         u64 offset;
3333         u64 extent_logical;
3334         u64 extent_physical;
3335         u64 extent_len;
3336         u64 stripe_logical;
3337         u64 stripe_end;
3338         struct btrfs_device *extent_dev;
3339         int extent_mirror_num;
3340         int stop_loop = 0;
3341
3342         physical = map->stripes[num].physical;
3343         offset = 0;
3344         nstripes = div64_u64(length, map->stripe_len);
3345         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3346                 offset = map->stripe_len * num;
3347                 increment = map->stripe_len * map->num_stripes;
3348                 mirror_num = 1;
3349         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3350                 int factor = map->num_stripes / map->sub_stripes;
3351                 offset = map->stripe_len * (num / map->sub_stripes);
3352                 increment = map->stripe_len * factor;
3353                 mirror_num = num % map->sub_stripes + 1;
3354         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3355                 increment = map->stripe_len;
3356                 mirror_num = num % map->num_stripes + 1;
3357         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3358                 increment = map->stripe_len;
3359                 mirror_num = num % map->num_stripes + 1;
3360         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3361                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3362                 increment = map->stripe_len * nr_data_stripes(map);
3363                 mirror_num = 1;
3364         } else {
3365                 increment = map->stripe_len;
3366                 mirror_num = 1;
3367         }
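        /*
         * A quick illustration of the RAID10 case above, assuming 4 stripes
         * with sub_stripes = 2 (factor = 2) and stripe num = 3:
         * offset = stripe_len * (3 / 2) = stripe_len,
         * increment = 2 * stripe_len and mirror_num = 3 % 2 + 1 = 2,
         * i.e. this device holds the second copy of every other
         * stripe_len-sized chunk of the chunk's logical range.
         */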
3368
3369         path = btrfs_alloc_path();
3370         if (!path)
3371                 return -ENOMEM;
3372
3373         ppath = btrfs_alloc_path();
3374         if (!ppath) {
3375                 btrfs_free_path(path);
3376                 return -ENOMEM;
3377         }
3378
3379         /*
3380          * work on commit root. The related disk blocks are static as
3381          * long as COW is applied. This means it is safe to rewrite
3382          * them to repair disk errors without any race conditions
3383          */
3384         path->search_commit_root = 1;
3385         path->skip_locking = 1;
3386
3387         ppath->search_commit_root = 1;
3388         ppath->skip_locking = 1;
3389         /*
3390          * trigger the readahead for the extent tree and csum tree and wait for
3391          * completion. During readahead, the scrub is officially paused
3392          * to not hold off transaction commits
3393          */
3394         logical = base + offset;
3395         physical_end = physical + nstripes * map->stripe_len;
3396         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3397                 get_raid56_logic_offset(physical_end, num,
3398                                         map, &logic_end, NULL);
3399                 logic_end += base;
3400         } else {
3401                 logic_end = logical + increment * nstripes;
3402         }
3403         wait_event(sctx->list_wait,
3404                    atomic_read(&sctx->bios_in_flight) == 0);
3405         scrub_blocked_if_needed(fs_info);
3406
3407         /* FIXME it might be better to start readahead at commit root */
3408         key.objectid = logical;
3409         key.type = BTRFS_EXTENT_ITEM_KEY;
3410         key.offset = (u64)0;
3411         key_end.objectid = logic_end;
3412         key_end.type = BTRFS_METADATA_ITEM_KEY;
3413         key_end.offset = (u64)-1;
3414         reada1 = btrfs_reada_add(root, &key, &key_end);
3415
3416         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3417         key.type = BTRFS_EXTENT_CSUM_KEY;
3418         key.offset = logical;
3419         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3420         key_end.type = BTRFS_EXTENT_CSUM_KEY;
3421         key_end.offset = logic_end;
3422         reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3423
3424         if (!IS_ERR(reada1))
3425                 btrfs_reada_wait(reada1);
3426         if (!IS_ERR(reada2))
3427                 btrfs_reada_wait(reada2);
3428
3429
3430         /*
3431          * collect all data csums for the stripe to avoid seeking during
3432          * the scrub. This might currently (crc32) end up being about 1MB
3433          */
3434         blk_start_plug(&plug);
3435
3436         /*
3437          * now find all extents for each stripe and scrub them
3438          */
3439         ret = 0;
3440         while (physical < physical_end) {
3441                 /*
3442                  * canceled?
3443                  */
3444                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3445                     atomic_read(&sctx->cancel_req)) {
3446                         ret = -ECANCELED;
3447                         goto out;
3448                 }
3449                 /*
3450                  * check to see if we have to pause
3451                  */
3452                 if (atomic_read(&fs_info->scrub_pause_req)) {
3453                         /* push queued extents */
3454                         sctx->flush_all_writes = true;
3455                         scrub_submit(sctx);
3456                         mutex_lock(&sctx->wr_lock);
3457                         scrub_wr_submit(sctx);
3458                         mutex_unlock(&sctx->wr_lock);
3459                         wait_event(sctx->list_wait,
3460                                    atomic_read(&sctx->bios_in_flight) == 0);
3461                         sctx->flush_all_writes = false;
3462                         scrub_blocked_if_needed(fs_info);
3463                 }
3464
3465                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3466                         ret = get_raid56_logic_offset(physical, num, map,
3467                                                       &logical,
3468                                                       &stripe_logical);
3469                         logical += base;
3470                         if (ret) {
3471                                 /* it is a parity stripe */
3472                                 stripe_logical += base;
3473                                 stripe_end = stripe_logical + increment;
3474                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3475                                                           ppath, stripe_logical,
3476                                                           stripe_end);
3477                                 if (ret)
3478                                         goto out;
3479                                 goto skip;
3480                         }
3481                 }
3482
3483                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3484                         key.type = BTRFS_METADATA_ITEM_KEY;
3485                 else
3486                         key.type = BTRFS_EXTENT_ITEM_KEY;
3487                 key.objectid = logical;
3488                 key.offset = (u64)-1;
3489
3490                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3491                 if (ret < 0)
3492                         goto out;
3493
3494                 if (ret > 0) {
3495                         ret = btrfs_previous_extent_item(root, path, 0);
3496                         if (ret < 0)
3497                                 goto out;
3498                         if (ret > 0) {
3499                                 /* there's no smaller item, so stick with the
3500                                  * larger one */
3501                                 btrfs_release_path(path);
3502                                 ret = btrfs_search_slot(NULL, root, &key,
3503                                                         path, 0, 0);
3504                                 if (ret < 0)
3505                                         goto out;
3506                         }
3507                 }
3508
3509                 stop_loop = 0;
3510                 while (1) {
3511                         u64 bytes;
3512
3513                         l = path->nodes[0];
3514                         slot = path->slots[0];
3515                         if (slot >= btrfs_header_nritems(l)) {
3516                                 ret = btrfs_next_leaf(root, path);
3517                                 if (ret == 0)
3518                                         continue;
3519                                 if (ret < 0)
3520                                         goto out;
3521
3522                                 stop_loop = 1;
3523                                 break;
3524                         }
3525                         btrfs_item_key_to_cpu(l, &key, slot);
3526
3527                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3528                             key.type != BTRFS_METADATA_ITEM_KEY)
3529                                 goto next;
3530
3531                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3532                                 bytes = fs_info->nodesize;
3533                         else
3534                                 bytes = key.offset;
3535
3536                         if (key.objectid + bytes <= logical)
3537                                 goto next;
3538
3539                         if (key.objectid >= logical + map->stripe_len) {
3540                                 /* out of this device extent */
3541                                 if (key.objectid >= logic_end)
3542                                         stop_loop = 1;
3543                                 break;
3544                         }
3545
3546                         extent = btrfs_item_ptr(l, slot,
3547                                                 struct btrfs_extent_item);
3548                         flags = btrfs_extent_flags(l, extent);
3549                         generation = btrfs_extent_generation(l, extent);
3550
3551                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3552                             (key.objectid < logical ||
3553                              key.objectid + bytes >
3554                              logical + map->stripe_len)) {
3555                                 btrfs_err(fs_info,
3556                                            "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3557                                        key.objectid, logical);
3558                                 spin_lock(&sctx->stat_lock);
3559                                 sctx->stat.uncorrectable_errors++;
3560                                 spin_unlock(&sctx->stat_lock);
3561                                 goto next;
3562                         }
3563
3564 again:
3565                         extent_logical = key.objectid;
3566                         extent_len = bytes;
3567
3568                         /*
3569                          * trim extent to this stripe
3570                          */
3571                         if (extent_logical < logical) {
3572                                 extent_len -= logical - extent_logical;
3573                                 extent_logical = logical;
3574                         }
3575                         if (extent_logical + extent_len >
3576                             logical + map->stripe_len) {
3577                                 extent_len = logical + map->stripe_len -
3578                                              extent_logical;
3579                         }
3580
3581                         extent_physical = extent_logical - logical + physical;
3582                         extent_dev = scrub_dev;
3583                         extent_mirror_num = mirror_num;
3584                         if (is_dev_replace)
3585                                 scrub_remap_extent(fs_info, extent_logical,
3586                                                    extent_len, &extent_physical,
3587                                                    &extent_dev,
3588                                                    &extent_mirror_num);
3589
3590                         ret = btrfs_lookup_csums_range(csum_root,
3591                                                        extent_logical,
3592                                                        extent_logical +
3593                                                        extent_len - 1,
3594                                                        &sctx->csum_list, 1);
3595                         if (ret)
3596                                 goto out;
3597
3598                         ret = scrub_extent(sctx, extent_logical, extent_len,
3599                                            extent_physical, extent_dev, flags,
3600                                            generation, extent_mirror_num,
3601                                            extent_logical - logical + physical);
3602
3603                         scrub_free_csums(sctx);
3604
3605                         if (ret)
3606                                 goto out;
3607
3608                         if (extent_logical + extent_len <
3609                             key.objectid + bytes) {
3610                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3611                                         /*
3612                                          * loop until we find the next data stripe
3613                                          * or we have finished all stripes.
3614                                          */
3615 loop:
3616                                         physical += map->stripe_len;
3617                                         ret = get_raid56_logic_offset(physical,
3618                                                         num, map, &logical,
3619                                                         &stripe_logical);
3620                                         logical += base;
3621
3622                                         if (ret && physical < physical_end) {
3623                                                 stripe_logical += base;
3624                                                 stripe_end = stripe_logical +
3625                                                                 increment;
3626                                                 ret = scrub_raid56_parity(sctx,
3627                                                         map, scrub_dev, ppath,
3628                                                         stripe_logical,
3629                                                         stripe_end);
3630                                                 if (ret)
3631                                                         goto out;
3632                                                 goto loop;
3633                                         }
3634                                 } else {
3635                                         physical += map->stripe_len;
3636                                         logical += increment;
3637                                 }
3638                                 if (logical < key.objectid + bytes) {
3639                                         cond_resched();
3640                                         goto again;
3641                                 }
3642
3643                                 if (physical >= physical_end) {
3644                                         stop_loop = 1;
3645                                         break;
3646                                 }
3647                         }
3648 next:
3649                         path->slots[0]++;
3650                 }
3651                 btrfs_release_path(path);
3652 skip:
3653                 logical += increment;
3654                 physical += map->stripe_len;
3655                 spin_lock(&sctx->stat_lock);
3656                 if (stop_loop)
3657                         sctx->stat.last_physical = map->stripes[num].physical +
3658                                                    length;
3659                 else
3660                         sctx->stat.last_physical = physical;
3661                 spin_unlock(&sctx->stat_lock);
3662                 if (stop_loop)
3663                         break;
3664         }
3665 out:
3666         /* push queued extents */
3667         scrub_submit(sctx);
3668         mutex_lock(&sctx->wr_lock);
3669         scrub_wr_submit(sctx);
3670         mutex_unlock(&sctx->wr_lock);
3671
3672         blk_finish_plug(&plug);
3673         btrfs_free_path(path);
3674         btrfs_free_path(ppath);
3675         return ret < 0 ? ret : 0;
3676 }
3677
3678 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3679                                           struct btrfs_device *scrub_dev,
3680                                           u64 chunk_offset, u64 length,
3681                                           u64 dev_offset,
3682                                           struct btrfs_block_group_cache *cache,
3683                                           int is_dev_replace)
3684 {
3685         struct btrfs_fs_info *fs_info = sctx->fs_info;
3686         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3687         struct map_lookup *map;
3688         struct extent_map *em;
3689         int i;
3690         int ret = 0;
3691
3692         read_lock(&map_tree->map_tree.lock);
3693         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3694         read_unlock(&map_tree->map_tree.lock);
3695
3696         if (!em) {
3697                 /*
3698                  * Might have been an unused block group deleted by the cleaner
3699                  * kthread or relocation.
3700                  */
3701                 spin_lock(&cache->lock);
3702                 if (!cache->removed)
3703                         ret = -EINVAL;
3704                 spin_unlock(&cache->lock);
3705
3706                 return ret;
3707         }
3708
3709         map = em->map_lookup;
3710         if (em->start != chunk_offset)
3711                 goto out;
3712
3713         if (em->len < length)
3714                 goto out;
3715
3716         for (i = 0; i < map->num_stripes; ++i) {
3717                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3718                     map->stripes[i].physical == dev_offset) {
3719                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3720                                            chunk_offset, length,
3721                                            is_dev_replace);
3722                         if (ret)
3723                                 goto out;
3724                 }
3725         }
3726 out:
3727         free_extent_map(em);
3728
3729         return ret;
3730 }
3731
3732 static noinline_for_stack
3733 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3734                            struct btrfs_device *scrub_dev, u64 start, u64 end,
3735                            int is_dev_replace)
3736 {
3737         struct btrfs_dev_extent *dev_extent = NULL;
3738         struct btrfs_path *path;
3739         struct btrfs_fs_info *fs_info = sctx->fs_info;
3740         struct btrfs_root *root = fs_info->dev_root;
3741         u64 length;
3742         u64 chunk_offset;
3743         int ret = 0;
3744         int ro_set;
3745         int slot;
3746         struct extent_buffer *l;
3747         struct btrfs_key key;
3748         struct btrfs_key found_key;
3749         struct btrfs_block_group_cache *cache;
3750         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3751
3752         path = btrfs_alloc_path();
3753         if (!path)
3754                 return -ENOMEM;
3755
3756         path->reada = READA_FORWARD;
3757         path->search_commit_root = 1;
3758         path->skip_locking = 1;
3759
3760         key.objectid = scrub_dev->devid;
3761         key.offset = 0ull;
3762         key.type = BTRFS_DEV_EXTENT_KEY;
3763
3764         while (1) {
3765                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3766                 if (ret < 0)
3767                         break;
3768                 if (ret > 0) {
3769                         if (path->slots[0] >=
3770                             btrfs_header_nritems(path->nodes[0])) {
3771                                 ret = btrfs_next_leaf(root, path);
3772                                 if (ret < 0)
3773                                         break;
3774                                 if (ret > 0) {
3775                                         ret = 0;
3776                                         break;
3777                                 }
3778                         } else {
3779                                 ret = 0;
3780                         }
3781                 }
3782
3783                 l = path->nodes[0];
3784                 slot = path->slots[0];
3785
3786                 btrfs_item_key_to_cpu(l, &found_key, slot);
3787
3788                 if (found_key.objectid != scrub_dev->devid)
3789                         break;
3790
3791                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3792                         break;
3793
3794                 if (found_key.offset >= end)
3795                         break;
3796
3797                 if (found_key.offset < key.offset)
3798                         break;
3799
3800                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3801                 length = btrfs_dev_extent_length(l, dev_extent);
3802
3803                 if (found_key.offset + length <= start)
3804                         goto skip;
3805
3806                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3807
3808                 /*
3809                  * get a reference on the corresponding block group to prevent
3810                  * the chunk from going away while we scrub it
3811                  */
3812                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3813
3814                 /* some chunks are removed but not committed to disk yet,
3815                  * continue scrubbing */
3816                 if (!cache)
3817                         goto skip;
3818
3819                 /*
3820                  * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3821                  * to avoid deadlock caused by:
3822                  * btrfs_inc_block_group_ro()
3823                  * -> btrfs_wait_for_commit()
3824                  * -> btrfs_commit_transaction()
3825                  * -> btrfs_scrub_pause()
3826                  */
3827                 scrub_pause_on(fs_info);
3828                 ret = btrfs_inc_block_group_ro(fs_info, cache);
3829                 if (!ret && is_dev_replace) {
3830                         /*
3831                          * If we are doing a device replace wait for any tasks
3832                          * that started delalloc right before we set the block
3833                          * group to RO mode, as they might have just allocated
3834                          * an extent from it or decided they could do a nocow
3835                          * write. And if any such tasks did that, wait for their
3836                          * ordered extents to complete and then commit the
3837                          * current transaction, so that we can later see the new
3838                          * extent items in the extent tree - the ordered extents
3839                          * create delayed data references (for cow writes) when
3840                          * they complete, which will be run and insert the
3841                          * corresponding extent items into the extent tree when
3842                          * we commit the transaction they used when running
3843                          * inode.c:btrfs_finish_ordered_io(). We later use
3844                          * the commit root of the extent tree to find extents
3845                          * to copy from the srcdev into the tgtdev, and we don't
3846                          * want to miss any new extents.
3847                          */
3848                         btrfs_wait_block_group_reservations(cache);
3849                         btrfs_wait_nocow_writers(cache);
3850                         ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
3851                                                        cache->key.objectid,
3852                                                        cache->key.offset);
3853                         if (ret > 0) {
3854                                 struct btrfs_trans_handle *trans;
3855
3856                                 trans = btrfs_join_transaction(root);
3857                                 if (IS_ERR(trans))
3858                                         ret = PTR_ERR(trans);
3859                                 else
3860                                         ret = btrfs_commit_transaction(trans);
3861                                 if (ret) {
3862                                         scrub_pause_off(fs_info);
3863                                         btrfs_put_block_group(cache);
3864                                         break;
3865                                 }
3866                         }
3867                 }
3868                 scrub_pause_off(fs_info);
3869
3870                 if (ret == 0) {
3871                         ro_set = 1;
3872                 } else if (ret == -ENOSPC) {
3873                         /*
3874                          * btrfs_inc_block_group_ro() returns -ENOSPC when it
3875                          * fails to create a new chunk for metadata.
3876                          * It is not a problem for scrub/replace, because
3877                          * metadata is always COWed, and our scrub is paused
3878                          * while transactions commit.
3879                          */
3880                         ro_set = 0;
3881                 } else {
3882                         btrfs_warn(fs_info,
3883                                    "failed setting block group ro: %d", ret);
3884                         btrfs_put_block_group(cache);
3885                         break;
3886                 }
3887
3888                 btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
3889                 dev_replace->cursor_right = found_key.offset + length;
3890                 dev_replace->cursor_left = found_key.offset;
3891                 dev_replace->item_needs_writeback = 1;
3892                 btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
3893                 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3894                                   found_key.offset, cache, is_dev_replace);
3895
3896                 /*
3897                  * Flush and submit all pending read and write bios, then
3898                  * wait for them.
3899                  * Note that in the dev replace case, a read request causes
3900                  * write requests that are submitted in the read completion
3901                  * worker. Therefore in the current situation, it is required
3902                  * that all write requests are flushed, so that all read and
3903                  * write requests are really completed when bios_in_flight
3904                  * changes to 0.
3905                  */
3906                 sctx->flush_all_writes = true;
3907                 scrub_submit(sctx);
3908                 mutex_lock(&sctx->wr_lock);
3909                 scrub_wr_submit(sctx);
3910                 mutex_unlock(&sctx->wr_lock);
3911
3912                 wait_event(sctx->list_wait,
3913                            atomic_read(&sctx->bios_in_flight) == 0);
3914
3915                 scrub_pause_on(fs_info);
3916
3917                 /*
3918                  * This must be done before we decrease @scrub_paused.
3919                  * It makes sure we don't block transaction commit while
3920                  * we are waiting for pending workers to finish.
3921                  */
3922                 wait_event(sctx->list_wait,
3923                            atomic_read(&sctx->workers_pending) == 0);
3924                 sctx->flush_all_writes = false;
3925
3926                 scrub_pause_off(fs_info);
3927
3928                 btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
3929                 dev_replace->cursor_left = dev_replace->cursor_right;
3930                 dev_replace->item_needs_writeback = 1;
3931                 btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
3932
3933                 if (ro_set)
3934                         btrfs_dec_block_group_ro(cache);
3935
3936                 /*
3937                  * We might have prevented the cleaner kthread from deleting
3938                  * this block group if it was already unused because we raced
3939                  * and set it to RO mode first. So add it back to the unused
3940                  * list, otherwise it might not ever be deleted unless a manual
3941                  * balance is triggered or it becomes used and unused again.
3942                  */
3943                 spin_lock(&cache->lock);
3944                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3945                     btrfs_block_group_used(&cache->item) == 0) {
3946                         spin_unlock(&cache->lock);
3947                         spin_lock(&fs_info->unused_bgs_lock);
3948                         if (list_empty(&cache->bg_list)) {
3949                                 btrfs_get_block_group(cache);
3950                                 list_add_tail(&cache->bg_list,
3951                                               &fs_info->unused_bgs);
3952                         }
3953                         spin_unlock(&fs_info->unused_bgs_lock);
3954                 } else {
3955                         spin_unlock(&cache->lock);
3956                 }
3957
3958                 btrfs_put_block_group(cache);
3959                 if (ret)
3960                         break;
3961                 if (is_dev_replace &&
3962                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3963                         ret = -EIO;
3964                         break;
3965                 }
3966                 if (sctx->stat.malloc_errors > 0) {
3967                         ret = -ENOMEM;
3968                         break;
3969                 }
3970 skip:
3971                 key.offset = found_key.offset + length;
3972                 btrfs_release_path(path);
3973         }
3974
3975         btrfs_free_path(path);
3976
3977         return ret;
3978 }
3979
3980 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3981                                            struct btrfs_device *scrub_dev)
3982 {
3983         int     i;
3984         u64     bytenr;
3985         u64     gen;
3986         int     ret;
3987         struct btrfs_fs_info *fs_info = sctx->fs_info;
3988
3989         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3990                 return -EIO;
3991
3992         /* Seed devices of a new filesystem have their own generation. */
3993         if (scrub_dev->fs_devices != fs_info->fs_devices)
3994                 gen = scrub_dev->generation;
3995         else
3996                 gen = fs_info->last_trans_committed;
3997
3998         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3999                 bytenr = btrfs_sb_offset(i);
4000                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4001                     scrub_dev->commit_total_bytes)
4002                         break;
4003
4004                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4005                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4006                                   NULL, 1, bytenr);
4007                 if (ret)
4008                         return ret;
4009         }
4010         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4011
4012         return 0;
4013 }
4014
4015 /*
4016  * get a reference count on fs_info->scrub_workers. Start workers if necessary.
4017  */
4018 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4019                                                 int is_dev_replace)
4020 {
4021         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4022         int max_active = fs_info->thread_pool_size;
4023
4024         if (fs_info->scrub_workers_refcnt == 0) {
4025                 fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
4026                                 flags, is_dev_replace ? 1 : max_active, 4);
4027                 if (!fs_info->scrub_workers)
4028                         goto fail_scrub_workers;
4029
4030                 fs_info->scrub_wr_completion_workers =
4031                         btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
4032                                               max_active, 2);
4033                 if (!fs_info->scrub_wr_completion_workers)
4034                         goto fail_scrub_wr_completion_workers;
4035
4036                 fs_info->scrub_nocow_workers =
4037                         btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
4038                 if (!fs_info->scrub_nocow_workers)
4039                         goto fail_scrub_nocow_workers;
4040                 fs_info->scrub_parity_workers =
4041                         btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
4042                                               max_active, 2);
4043                 if (!fs_info->scrub_parity_workers)
4044                         goto fail_scrub_parity_workers;
4045         }
4046         ++fs_info->scrub_workers_refcnt;
4047         return 0;
4048
4049 fail_scrub_parity_workers:
4050         btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4051 fail_scrub_nocow_workers:
4052         btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4053 fail_scrub_wr_completion_workers:
4054         btrfs_destroy_workqueue(fs_info->scrub_workers);
4055 fail_scrub_workers:
4056         return -ENOMEM;
4057 }
4058
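     /*
      * Drop a reference on the scrub workqueues and destroy them once the
      * last user is gone.  Counterpart of scrub_workers_get(); the plain
      * int refcount relies on the callers for serialization (normally via
      * fs_info->scrub_lock).
      */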
4059 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
4060 {
4061         if (--fs_info->scrub_workers_refcnt == 0) {
4062                 btrfs_destroy_workqueue(fs_info->scrub_workers);
4063                 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4064                 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4065                 btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
4066         }
4067         WARN_ON(fs_info->scrub_workers_refcnt < 0);
4068 }
4069
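     /*
      * btrfs_scrub_dev - scrub or (for dev-replace) copy one device
      * @fs_info:        filesystem to scrub
      * @devid:          device to scrub
      * @start:          start of the physical byte range to scrub
      * @end:            end of the physical byte range to scrub
      * @progress:       if non-NULL, the final statistics are copied here
      * @readonly:       do not write repaired data back to disk
      * @is_dev_replace: run in dev-replace mode, writing to the target device
      *
      * Common entry point used by the scrub ioctl and by dev-replace.
      * Returns 0 on success, -EINPROGRESS if a scrub or replace is already
      * running on the device, or another negative errno on failure.
      */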
4070 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4071                     u64 end, struct btrfs_scrub_progress *progress,
4072                     int readonly, int is_dev_replace)
4073 {
4074         struct scrub_ctx *sctx;
4075         int ret;
4076         struct btrfs_device *dev;
4077         struct rcu_string *name;
4078
4079         if (btrfs_fs_closing(fs_info))
4080                 return -EINVAL;
4081
4082         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4083                 /*
4084                  * In this case scrub is unable to calculate the checksum
4085                  * the way it is currently implemented. Do not handle this
4086                  * situation at all because it should never happen.
4087                  */
4088                 btrfs_err(fs_info,
4089                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4090                        fs_info->nodesize,
4091                        BTRFS_STRIPE_LEN);
4092                 return -EINVAL;
4093         }
4094
4095         if (fs_info->sectorsize != PAGE_SIZE) {
4096                 /* not supported for data w/o checksums */
4097                 btrfs_err_rl(fs_info,
4098                            "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
4099                        fs_info->sectorsize, PAGE_SIZE);
4100                 return -EINVAL;
4101         }
4102
4103         if (fs_info->nodesize >
4104             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
4105             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
4106                 /*
4107                  * This would exhaust the array bounds of the pagev member
4108                  * in struct scrub_block.
4109                  */
4110                 btrfs_err(fs_info,
4111                           "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
4112                        fs_info->nodesize,
4113                        SCRUB_MAX_PAGES_PER_BLOCK,
4114                        fs_info->sectorsize,
4115                        SCRUB_MAX_PAGES_PER_BLOCK);
4116                 return -EINVAL;
4117         }
4118
4119
4120         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4121         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
4122         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4123                      !is_dev_replace)) {
4124                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4125                 return -ENODEV;
4126         }
4127
4128         if (!is_dev_replace && !readonly &&
4129             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4130                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4131                 rcu_read_lock();
4132                 name = rcu_dereference(dev->name);
4133                 btrfs_err(fs_info, "scrub: device %s is not writable",
4134                           name->str);
4135                 rcu_read_unlock();
4136                 return -EROFS;
4137         }
4138
4139         mutex_lock(&fs_info->scrub_lock);
4140         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4141             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4142                 mutex_unlock(&fs_info->scrub_lock);
4143                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4144                 return -EIO;
4145         }
4146
4147         btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
4148         if (dev->scrub_ctx ||
4149             (!is_dev_replace &&
4150              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4151                 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
4152                 mutex_unlock(&fs_info->scrub_lock);
4153                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4154                 return -EINPROGRESS;
4155         }
4156         btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
4157
4158         ret = scrub_workers_get(fs_info, is_dev_replace);
4159         if (ret) {
4160                 mutex_unlock(&fs_info->scrub_lock);
4161                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4162                 return ret;
4163         }
4164
4165         sctx = scrub_setup_ctx(dev, is_dev_replace);
4166         if (IS_ERR(sctx)) {
4167                 mutex_unlock(&fs_info->scrub_lock);
4168                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4169                 scrub_workers_put(fs_info);
4170                 return PTR_ERR(sctx);
4171         }
4172         sctx->readonly = readonly;
4173         dev->scrub_ctx = sctx;
4174         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4175
4176         /*
4177          * By checking @scrub_pause_req here we can avoid a race
4178          * between transaction commit and scrubbing.
4179          */
4180         __scrub_blocked_if_needed(fs_info);
4181         atomic_inc(&fs_info->scrubs_running);
4182         mutex_unlock(&fs_info->scrub_lock);
4183
4184         if (!is_dev_replace) {
4185                 /*
4186                  * By holding the device list mutex we avoid racing with
4187                  * the super block writes kicked off by a log tree sync.
4188                  */
4189                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4190                 ret = scrub_supers(sctx, dev);
4191                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4192         }
4193
4194         if (!ret)
4195                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
4196                                              is_dev_replace);
4197
4198         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4199         atomic_dec(&fs_info->scrubs_running);
4200         wake_up(&fs_info->scrub_pause_wait);
4201
4202         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4203
4204         if (progress)
4205                 memcpy(progress, &sctx->stat, sizeof(*progress));
4206
4207         mutex_lock(&fs_info->scrub_lock);
4208         dev->scrub_ctx = NULL;
4209         scrub_workers_put(fs_info);
4210         mutex_unlock(&fs_info->scrub_lock);
4211
4212         scrub_put_ctx(sctx);
4213
4214         return ret;
4215 }
4216
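     /*
      * Pause all running scrubs: bump scrub_pause_req and wait until every
      * running scrub context has parked itself (scrubs_paused ==
      * scrubs_running).  Used e.g. by the transaction commit path so that
      * scrub does not race with critical tree operations.  Must be paired
      * with btrfs_scrub_continue().
      */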
4217 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4218 {
4219         mutex_lock(&fs_info->scrub_lock);
4220         atomic_inc(&fs_info->scrub_pause_req);
4221         while (atomic_read(&fs_info->scrubs_paused) !=
4222                atomic_read(&fs_info->scrubs_running)) {
4223                 mutex_unlock(&fs_info->scrub_lock);
4224                 wait_event(fs_info->scrub_pause_wait,
4225                            atomic_read(&fs_info->scrubs_paused) ==
4226                            atomic_read(&fs_info->scrubs_running));
4227                 mutex_lock(&fs_info->scrub_lock);
4228         }
4229         mutex_unlock(&fs_info->scrub_lock);
4230 }
4231
4232 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4233 {
4234         atomic_dec(&fs_info->scrub_pause_req);
4235         wake_up(&fs_info->scrub_pause_wait);
4236 }
4237
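     /*
      * Cancel all running scrubs on this filesystem and wait for them to
      * finish.  Returns -ENOTCONN if no scrub was running.
      */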
4238 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4239 {
4240         mutex_lock(&fs_info->scrub_lock);
4241         if (!atomic_read(&fs_info->scrubs_running)) {
4242                 mutex_unlock(&fs_info->scrub_lock);
4243                 return -ENOTCONN;
4244         }
4245
4246         atomic_inc(&fs_info->scrub_cancel_req);
4247         while (atomic_read(&fs_info->scrubs_running)) {
4248                 mutex_unlock(&fs_info->scrub_lock);
4249                 wait_event(fs_info->scrub_pause_wait,
4250                            atomic_read(&fs_info->scrubs_running) == 0);
4251                 mutex_lock(&fs_info->scrub_lock);
4252         }
4253         atomic_dec(&fs_info->scrub_cancel_req);
4254         mutex_unlock(&fs_info->scrub_lock);
4255
4256         return 0;
4257 }
4258
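     /*
      * Cancel the scrub running on @dev, if any, and wait until its context
      * has been torn down.  Returns -ENOTCONN if no scrub was running on
      * the device.
      */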
4259 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
4260                            struct btrfs_device *dev)
4261 {
4262         struct scrub_ctx *sctx;
4263
4264         mutex_lock(&fs_info->scrub_lock);
4265         sctx = dev->scrub_ctx;
4266         if (!sctx) {
4267                 mutex_unlock(&fs_info->scrub_lock);
4268                 return -ENOTCONN;
4269         }
4270         atomic_inc(&sctx->cancel_req);
4271         while (dev->scrub_ctx) {
4272                 mutex_unlock(&fs_info->scrub_lock);
4273                 wait_event(fs_info->scrub_pause_wait,
4274                            dev->scrub_ctx == NULL);
4275                 mutex_lock(&fs_info->scrub_lock);
4276         }
4277         mutex_unlock(&fs_info->scrub_lock);
4278
4279         return 0;
4280 }
4281
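     /*
      * Copy the statistics of the scrub currently running on @devid into
      * @progress.  Returns -ENODEV if the device is unknown and -ENOTCONN
      * if no scrub is running on it.
      */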
4282 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4283                          struct btrfs_scrub_progress *progress)
4284 {
4285         struct btrfs_device *dev;
4286         struct scrub_ctx *sctx = NULL;
4287
4288         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4289         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
4290         if (dev)
4291                 sctx = dev->scrub_ctx;
4292         if (sctx)
4293                 memcpy(progress, &sctx->stat, sizeof(*progress));
4294         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4295
4296         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4297 }
4298
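     /*
      * Map @extent_logical to the physical offset, device and mirror of its
      * first stripe so the dev-replace path can access the raw device
      * directly.  If the mapping fails, the output parameters are left
      * untouched.
      */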
4299 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4300                                u64 extent_logical, u64 extent_len,
4301                                u64 *extent_physical,
4302                                struct btrfs_device **extent_dev,
4303                                int *extent_mirror_num)
4304 {
4305         u64 mapped_length;
4306         struct btrfs_bio *bbio = NULL;
4307         int ret;
4308
4309         mapped_length = extent_len;
4310         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4311                               &mapped_length, &bbio, 0);
4312         if (ret || !bbio || mapped_length < extent_len ||
4313             !bbio->stripes[0].dev->bdev) {
4314                 btrfs_put_bbio(bbio);
4315                 return;
4316         }
4317
4318         *extent_physical = bbio->stripes[0].physical;
4319         *extent_mirror_num = bbio->mirror_num;
4320         *extent_dev = bbio->stripes[0].dev;
4321         btrfs_put_bbio(bbio);
4322 }
4323
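     /*
      * Entry point of the nocow copy path used during dev-replace: data
      * without checksums cannot be verified by scrub, and nodatacow writes
      * may rewrite it in place at any time, so the range is instead re-read
      * through the page cache of each inode that references it and written
      * to the replace target.  The heavy lifting is done asynchronously by
      * copy_nocow_pages_worker() on the scrub_nocow workqueue.
      */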
4324 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
4325                             int mirror_num, u64 physical_for_dev_replace)
4326 {
4327         struct scrub_copy_nocow_ctx *nocow_ctx;
4328         struct btrfs_fs_info *fs_info = sctx->fs_info;
4329
4330         nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
4331         if (!nocow_ctx) {
4332                 spin_lock(&sctx->stat_lock);
4333                 sctx->stat.malloc_errors++;
4334                 spin_unlock(&sctx->stat_lock);
4335                 return -ENOMEM;
4336         }
4337
4338         scrub_pending_trans_workers_inc(sctx);
4339
4340         nocow_ctx->sctx = sctx;
4341         nocow_ctx->logical = logical;
4342         nocow_ctx->len = len;
4343         nocow_ctx->mirror_num = mirror_num;
4344         nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
4345         btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
4346                         copy_nocow_pages_worker, NULL, NULL);
4347         INIT_LIST_HEAD(&nocow_ctx->inodes);
4348         btrfs_queue_work(fs_info->scrub_nocow_workers,
4349                          &nocow_ctx->work);
4350
4351         return 0;
4352 }
4353
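     /*
      * Backref iteration callback: remember each (inum, offset, root) that
      * references the extent so the inodes can be processed after the
      * backref walk has finished.
      */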
4354 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
4355 {
4356         struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
4357         struct scrub_nocow_inode *nocow_inode;
4358
4359         nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
4360         if (!nocow_inode)
4361                 return -ENOMEM;
4362         nocow_inode->inum = inum;
4363         nocow_inode->offset = offset;
4364         nocow_inode->root = root;
4365         list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
4366         return 0;
4367 }
4368
4369 #define COPY_COMPLETE 1
4370
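     /*
      * Worker for the nocow copy path: walk the backrefs of the logical
      * address to collect every inode that references it, then try the
      * inodes one by one until copy_nocow_pages_for_inode() reports
      * COPY_COMPLETE or a hard error.  If nothing could be written, the
      * failure is accounted as an uncorrectable read error in the
      * dev-replace statistics.
      */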
4371 static void copy_nocow_pages_worker(struct btrfs_work *work)
4372 {
4373         struct scrub_copy_nocow_ctx *nocow_ctx =
4374                 container_of(work, struct scrub_copy_nocow_ctx, work);
4375         struct scrub_ctx *sctx = nocow_ctx->sctx;
4376         struct btrfs_fs_info *fs_info = sctx->fs_info;
4377         struct btrfs_root *root = fs_info->extent_root;
4378         u64 logical = nocow_ctx->logical;
4379         u64 len = nocow_ctx->len;
4380         int mirror_num = nocow_ctx->mirror_num;
4381         u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4382         int ret;
4383         struct btrfs_trans_handle *trans = NULL;
4384         struct btrfs_path *path;
4385         int not_written = 0;
4386
4387         path = btrfs_alloc_path();
4388         if (!path) {
4389                 spin_lock(&sctx->stat_lock);
4390                 sctx->stat.malloc_errors++;
4391                 spin_unlock(&sctx->stat_lock);
4392                 not_written = 1;
4393                 goto out;
4394         }
4395
4396         trans = btrfs_join_transaction(root);
4397         if (IS_ERR(trans)) {
4398                 not_written = 1;
4399                 goto out;
4400         }
4401
4402         ret = iterate_inodes_from_logical(logical, fs_info, path,
4403                         record_inode_for_nocow, nocow_ctx, false);
4404         if (ret != 0 && ret != -ENOENT) {
4405                 btrfs_warn(fs_info,
4406                            "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
4407                            logical, physical_for_dev_replace, len, mirror_num,
4408                            ret);
4409                 not_written = 1;
4410                 goto out;
4411         }
4412
4413         btrfs_end_transaction(trans);
4414         trans = NULL;
4415         while (!list_empty(&nocow_ctx->inodes)) {
4416                 struct scrub_nocow_inode *entry;
4417                 entry = list_first_entry(&nocow_ctx->inodes,
4418                                          struct scrub_nocow_inode,
4419                                          list);
4420                 list_del_init(&entry->list);
4421                 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
4422                                                  entry->root, nocow_ctx);
4423                 kfree(entry);
4424                 if (ret == COPY_COMPLETE) {
4425                         ret = 0;
4426                         break;
4427                 } else if (ret) {
4428                         break;
4429                 }
4430         }
4431 out:
4432         while (!list_empty(&nocow_ctx->inodes)) {
4433                 struct scrub_nocow_inode *entry;
4434                 entry = list_first_entry(&nocow_ctx->inodes,
4435                                          struct scrub_nocow_inode,
4436                                          list);
4437                 list_del_init(&entry->list);
4438                 kfree(entry);
4439         }
4440         if (trans && !IS_ERR(trans))
4441                 btrfs_end_transaction(trans);
4442         if (not_written)
4443                 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4444                                             num_uncorrectable_read_errors);
4445
4446         btrfs_free_path(path);
4447         kfree(nocow_ctx);
4448
4449         scrub_pending_trans_workers_dec(sctx);
4450 }
4451
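     /*
      * Check whether the file range [@start, @start + @len) still maps to
      * the logical address @logical.  Returns 1 if there is a pending
      * ordered extent or the extent map no longer covers the range (skip
      * this inode), 0 if the range is still valid, or a negative errno.
      */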
4452 static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
4453                                  u64 logical)
4454 {
4455         struct extent_state *cached_state = NULL;
4456         struct btrfs_ordered_extent *ordered;
4457         struct extent_io_tree *io_tree;
4458         struct extent_map *em;
4459         u64 lockstart = start, lockend = start + len - 1;
4460         int ret = 0;
4461
4462         io_tree = &inode->io_tree;
4463
4464         lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
4465         ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4466         if (ordered) {
4467                 btrfs_put_ordered_extent(ordered);
4468                 ret = 1;
4469                 goto out_unlock;
4470         }
4471
4472         em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4473         if (IS_ERR(em)) {
4474                 ret = PTR_ERR(em);
4475                 goto out_unlock;
4476         }
4477
4478         /*
4479          * This extent does not actually cover the logical extent anymore;
4480          * move on to the next inode.
4481          */
4482         if (em->block_start > logical ||
4483             em->block_start + em->block_len < logical + len) {
4484                 free_extent_map(em);
4485                 ret = 1;
4486                 goto out_unlock;
4487         }
4488         free_extent_map(em);
4489
4490 out_unlock:
4491         unlock_extent_cached(io_tree, lockstart, lockend, &cached_state);
4492         return ret;
4493 }
4494
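     /*
      * Copy the nocow range as seen through one inode to the replace
      * target: look up the inode, hold the inode lock to keep truncate,
      * direct I/O and hole punching away, read each page through the page
      * cache if needed and write it out with write_page_nocow().  Returns
      * COPY_COMPLETE when the whole range was written, 0 if the extent no
      * longer matches (try the next inode), or a negative errno.
      */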
4495 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4496                                       struct scrub_copy_nocow_ctx *nocow_ctx)
4497 {
4498         struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
4499         struct btrfs_key key;
4500         struct inode *inode;
4501         struct page *page;
4502         struct btrfs_root *local_root;
4503         struct extent_io_tree *io_tree;
4504         u64 physical_for_dev_replace;
4505         u64 nocow_ctx_logical;
4506         u64 len = nocow_ctx->len;
4507         unsigned long index;
4508         int srcu_index;
4509         int ret = 0;
4510         int err = 0;
4511
4512         key.objectid = root;
4513         key.type = BTRFS_ROOT_ITEM_KEY;
4514         key.offset = (u64)-1;
4515
4516         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4517
4518         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
4519         if (IS_ERR(local_root)) {
4520                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4521                 return PTR_ERR(local_root);
4522         }
4523
4524         key.type = BTRFS_INODE_ITEM_KEY;
4525         key.objectid = inum;
4526         key.offset = 0;
4527         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
4528         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4529         if (IS_ERR(inode))
4530                 return PTR_ERR(inode);
4531
4532         /* Avoid truncate/dio/punch hole. */
4533         inode_lock(inode);
4534         inode_dio_wait(inode);
4535
4536         physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4537         io_tree = &BTRFS_I(inode)->io_tree;
4538         nocow_ctx_logical = nocow_ctx->logical;
4539
4540         ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4541                         nocow_ctx_logical);
4542         if (ret) {
4543                 ret = ret > 0 ? 0 : ret;
4544                 goto out;
4545         }
4546
4547         while (len >= PAGE_SIZE) {
4548                 index = offset >> PAGE_SHIFT;
4549 again:
4550                 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4551                 if (!page) {
4552                         btrfs_err(fs_info, "find_or_create_page() failed");
4553                         ret = -ENOMEM;
4554                         goto out;
4555                 }
4556
4557                 if (PageUptodate(page)) {
4558                         if (PageDirty(page))
4559                                 goto next_page;
4560                 } else {
4561                         ClearPageError(page);
4562                         err = extent_read_full_page(io_tree, page,
4563                                                            btrfs_get_extent,
4564                                                            nocow_ctx->mirror_num);
4565                         if (err) {
4566                                 ret = err;
4567                                 goto next_page;
4568                         }
4569
4570                         lock_page(page);
4571                         /*
4572                          * If the page has been removed from the page cache,
4573                          * the data on it is meaningless, because it may be
4574                          * the old copy while the new data may have been
4575                          * written into a new page in the page cache.
4576                          */
4577                         if (page->mapping != inode->i_mapping) {
4578                                 unlock_page(page);
4579                                 put_page(page);
4580                                 goto again;
4581                         }
4582                         if (!PageUptodate(page)) {
4583                                 ret = -EIO;
4584                                 goto next_page;
4585                         }
4586                 }
4587
4588                 ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4589                                             nocow_ctx_logical);
4590                 if (ret) {
4591                         ret = ret > 0 ? 0 : ret;
4592                         goto next_page;
4593                 }
4594
4595                 err = write_page_nocow(nocow_ctx->sctx,
4596                                        physical_for_dev_replace, page);
4597                 if (err)
4598                         ret = err;
4599 next_page:
4600                 unlock_page(page);
4601                 put_page(page);
4602
4603                 if (ret)
4604                         break;
4605
4606                 offset += PAGE_SIZE;
4607                 physical_for_dev_replace += PAGE_SIZE;
4608                 nocow_ctx_logical += PAGE_SIZE;
4609                 len -= PAGE_SIZE;
4610         }
4611         ret = COPY_COMPLETE;
4612 out:
4613         inode_unlock(inode);
4614         iput(inode);
4615         return ret;
4616 }
4617
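     /*
      * Synchronously write a single page to @physical_for_dev_replace on
      * the dev-replace target device.  Returns 0 on success or -EIO on any
      * failure; bio failures are also recorded in the device's write error
      * statistics.
      */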
4618 static int write_page_nocow(struct scrub_ctx *sctx,
4619                             u64 physical_for_dev_replace, struct page *page)
4620 {
4621         struct bio *bio;
4622         struct btrfs_device *dev;
4623         int ret;
4624
4625         dev = sctx->wr_tgtdev;
4626         if (!dev)
4627                 return -EIO;
4628         if (!dev->bdev) {
4629                 btrfs_warn_rl(dev->fs_info,
4630                         "scrub write_page_nocow(bdev == NULL) is unexpected");
4631                 return -EIO;
4632         }
4633         bio = btrfs_io_bio_alloc(1);
4634         bio->bi_iter.bi_size = 0;
4635         bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4636         bio_set_dev(bio, dev->bdev);
4637         bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
4638         ret = bio_add_page(bio, page, PAGE_SIZE, 0);
4639         if (ret != PAGE_SIZE) {
4640 leave_with_eio:
4641                 bio_put(bio);
4642                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4643                 return -EIO;
4644         }
4645
4646         if (btrfsic_submit_bio_wait(bio))
4647                 goto leave_with_eio;
4648
4649         bio_put(bio);
4650         return 0;
4651 }