btrfs: open-code bio_set_op_attrs
fs/btrfs/scrub.c
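bio_set_op_attrs() is a trivial wrapper that only assigns bio->bi_opf, so open-coding
it amounts to a direct assignment. A sketch of the pattern such a change applies
(illustrative, not the exact hunks of this commit):

	-	bio_set_op_attrs(bio, REQ_OP_READ, 0);
	+	bio->bi_opf = REQ_OP_READ;
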
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include "ctree.h"
10 #include "volumes.h"
11 #include "disk-io.h"
12 #include "ordered-data.h"
13 #include "transaction.h"
14 #include "backref.h"
15 #include "extent_io.h"
16 #include "dev-replace.h"
17 #include "check-integrity.h"
18 #include "rcu-string.h"
19 #include "raid56.h"
20
21 /*
22  * This is only the first step towards a full-featured scrub. It reads all
23  * extents and super blocks and verifies the checksums. In case a bad checksum
24  * is found or the extent cannot be read, good data will be written back if
25  * any can be found.
26  *
27  * Future enhancements:
28  *  - In case an unrepairable extent is encountered, track which files are
29  *    affected and report them
30  *  - track and record media errors, throw out bad devices
31  *  - add a mode to also read unallocated space
32  */
33
34 struct scrub_block;
35 struct scrub_ctx;
36
37 /*
38  * The following three values only influence performance.
39  * The last one configures the number of parallel and outstanding I/O
40  * operations. The first two values configure an upper limit for the number
41  * of (dynamically allocated) pages that are added to a bio.
42  */
43 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
44 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
45 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
46
47 /*
48  * the following value times PAGE_SIZE needs to be large enough to match the
49  * largest node/leaf/sector size that shall be supported.
50  * Values larger than BTRFS_STRIPE_LEN are not supported.
51  */
52 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
53
54 struct scrub_recover {
55         refcount_t              refs;
56         struct btrfs_bio        *bbio;
57         u64                     map_length;
58 };
59
60 struct scrub_page {
61         struct scrub_block      *sblock;
62         struct page             *page;
63         struct btrfs_device     *dev;
64         struct list_head        list;
65         u64                     flags;  /* extent flags */
66         u64                     generation;
67         u64                     logical;
68         u64                     physical;
69         u64                     physical_for_dev_replace;
70         atomic_t                refs;
71         struct {
72                 unsigned int    mirror_num:8;
73                 unsigned int    have_csum:1;
74                 unsigned int    io_error:1;
75         };
76         u8                      csum[BTRFS_CSUM_SIZE];
77
78         struct scrub_recover    *recover;
79 };
80
81 struct scrub_bio {
82         int                     index;
83         struct scrub_ctx        *sctx;
84         struct btrfs_device     *dev;
85         struct bio              *bio;
86         blk_status_t            status;
87         u64                     logical;
88         u64                     physical;
89 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
90         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
91 #else
92         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
93 #endif
94         int                     page_count;
95         int                     next_free;
96         struct btrfs_work       work;
97 };
98
99 struct scrub_block {
100         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
101         int                     page_count;
102         atomic_t                outstanding_pages;
103         refcount_t              refs; /* free mem on transition to zero */
104         struct scrub_ctx        *sctx;
105         struct scrub_parity     *sparity;
106         struct {
107                 unsigned int    header_error:1;
108                 unsigned int    checksum_error:1;
109                 unsigned int    no_io_error_seen:1;
110                 unsigned int    generation_error:1; /* also sets header_error */
111
112                 /* The following is for the data used to check parity */
113                 /* It is for the data with checksum */
114                 unsigned int    data_corrected:1;
115         };
116         struct btrfs_work       work;
117 };
118
119 /* Used for the chunks with parity stripes, such as RAID5/6 */
120 struct scrub_parity {
121         struct scrub_ctx        *sctx;
122
123         struct btrfs_device     *scrub_dev;
124
125         u64                     logic_start;
126
127         u64                     logic_end;
128
129         int                     nsectors;
130
131         u64                     stripe_len;
132
133         refcount_t              refs;
134
135         struct list_head        spages;
136
137         /* Work of parity check and repair */
138         struct btrfs_work       work;
139
140         /* Mark the parity blocks which have data */
141         unsigned long           *dbitmap;
142
143         /*
144          * Mark the parity blocks which have data but where errors happened
145          * when reading or checking the data
146          */
147         unsigned long           *ebitmap;
148
149         unsigned long           bitmap[0];
150 };
151
152 struct scrub_ctx {
153         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
154         struct btrfs_fs_info    *fs_info;
155         int                     first_free;
156         int                     curr;
157         atomic_t                bios_in_flight;
158         atomic_t                workers_pending;
159         spinlock_t              list_lock;
160         wait_queue_head_t       list_wait;
161         u16                     csum_size;
162         struct list_head        csum_list;
163         atomic_t                cancel_req;
164         int                     readonly;
165         int                     pages_per_rd_bio;
166
167         int                     is_dev_replace;
168
169         struct scrub_bio        *wr_curr_bio;
170         struct mutex            wr_lock;
171         int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
172         struct btrfs_device     *wr_tgtdev;
173         bool                    flush_all_writes;
174
175         /*
176          * statistics
177          */
178         struct btrfs_scrub_progress stat;
179         spinlock_t              stat_lock;
180
181         /*
182          * Use a ref counter to avoid use-after-free issues. Scrub workers
183          * decrement bios_in_flight and workers_pending and then do a wakeup
184          * on the list_wait wait queue. We must ensure the main scrub task
185          * doesn't free the scrub context before or while the workers are
186          * doing the wakeup() call.
187          */
188         refcount_t              refs;
189 };
190
191 struct scrub_fixup_nodatasum {
192         struct scrub_ctx        *sctx;
193         struct btrfs_device     *dev;
194         u64                     logical;
195         struct btrfs_root       *root;
196         struct btrfs_work       work;
197         int                     mirror_num;
198 };
199
200 struct scrub_warning {
201         struct btrfs_path       *path;
202         u64                     extent_item_size;
203         const char              *errstr;
204         u64                     physical;
205         u64                     logical;
206         struct btrfs_device     *dev;
207 };
208
209 struct full_stripe_lock {
210         struct rb_node node;
211         u64 logical;
212         u64 refs;
213         struct mutex mutex;
214 };
215
216 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
217 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
218 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
219 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
220 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
221 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
222                                      struct scrub_block *sblocks_for_recheck);
223 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
224                                 struct scrub_block *sblock,
225                                 int retry_failed_mirror);
226 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
227 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
228                                              struct scrub_block *sblock_good);
229 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
230                                             struct scrub_block *sblock_good,
231                                             int page_num, int force_write);
232 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
233 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
234                                            int page_num);
235 static int scrub_checksum_data(struct scrub_block *sblock);
236 static int scrub_checksum_tree_block(struct scrub_block *sblock);
237 static int scrub_checksum_super(struct scrub_block *sblock);
238 static void scrub_block_get(struct scrub_block *sblock);
239 static void scrub_block_put(struct scrub_block *sblock);
240 static void scrub_page_get(struct scrub_page *spage);
241 static void scrub_page_put(struct scrub_page *spage);
242 static void scrub_parity_get(struct scrub_parity *sparity);
243 static void scrub_parity_put(struct scrub_parity *sparity);
244 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
245                                     struct scrub_page *spage);
246 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
247                        u64 physical, struct btrfs_device *dev, u64 flags,
248                        u64 gen, int mirror_num, u8 *csum, int force,
249                        u64 physical_for_dev_replace);
250 static void scrub_bio_end_io(struct bio *bio);
251 static void scrub_bio_end_io_worker(struct btrfs_work *work);
252 static void scrub_block_complete(struct scrub_block *sblock);
253 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
254                                u64 extent_logical, u64 extent_len,
255                                u64 *extent_physical,
256                                struct btrfs_device **extent_dev,
257                                int *extent_mirror_num);
258 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
259                                     struct scrub_page *spage);
260 static void scrub_wr_submit(struct scrub_ctx *sctx);
261 static void scrub_wr_bio_end_io(struct bio *bio);
262 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
263 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
264 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
265 static void scrub_put_ctx(struct scrub_ctx *sctx);
266
267 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
268 {
269         return page->recover &&
270                (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
271 }
272
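    /*
     * Each bio in flight holds a reference on the scrub context, so that
     * scrub_pending_bio_dec() can wake up waiters on list_wait and drop the
     * reference without the context being freed underneath it.
     */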
273 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
274 {
275         refcount_inc(&sctx->refs);
276         atomic_inc(&sctx->bios_in_flight);
277 }
278
279 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
280 {
281         atomic_dec(&sctx->bios_in_flight);
282         wake_up(&sctx->list_wait);
283         scrub_put_ctx(sctx);
284 }
285
286 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
287 {
288         while (atomic_read(&fs_info->scrub_pause_req)) {
289                 mutex_unlock(&fs_info->scrub_lock);
290                 wait_event(fs_info->scrub_pause_wait,
291                    atomic_read(&fs_info->scrub_pause_req) == 0);
292                 mutex_lock(&fs_info->scrub_lock);
293         }
294 }
295
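    /*
     * scrub_pause_on() and scrub_pause_off() bracket a region during which
     * this scrub counts itself as paused; scrub_pause_off() additionally
     * waits until any pending pause request has been released.
     */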
296 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
297 {
298         atomic_inc(&fs_info->scrubs_paused);
299         wake_up(&fs_info->scrub_pause_wait);
300 }
301
302 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
303 {
304         mutex_lock(&fs_info->scrub_lock);
305         __scrub_blocked_if_needed(fs_info);
306         atomic_dec(&fs_info->scrubs_paused);
307         mutex_unlock(&fs_info->scrub_lock);
308
309         wake_up(&fs_info->scrub_pause_wait);
310 }
311
312 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
313 {
314         scrub_pause_on(fs_info);
315         scrub_pause_off(fs_info);
316 }
317
318 /*
319  * Insert new full stripe lock into full stripe locks tree
320  *
321  * Return pointer to existing or newly inserted full_stripe_lock structure if
322  * everything works well.
323  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
324  *
325  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
326  * function
327  */
328 static struct full_stripe_lock *insert_full_stripe_lock(
329                 struct btrfs_full_stripe_locks_tree *locks_root,
330                 u64 fstripe_logical)
331 {
332         struct rb_node **p;
333         struct rb_node *parent = NULL;
334         struct full_stripe_lock *entry;
335         struct full_stripe_lock *ret;
336
337         lockdep_assert_held(&locks_root->lock);
338
339         p = &locks_root->root.rb_node;
340         while (*p) {
341                 parent = *p;
342                 entry = rb_entry(parent, struct full_stripe_lock, node);
343                 if (fstripe_logical < entry->logical) {
344                         p = &(*p)->rb_left;
345                 } else if (fstripe_logical > entry->logical) {
346                         p = &(*p)->rb_right;
347                 } else {
348                         entry->refs++;
349                         return entry;
350                 }
351         }
352
353         /* Insert new lock */
354         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
355         if (!ret)
356                 return ERR_PTR(-ENOMEM);
357         ret->logical = fstripe_logical;
358         ret->refs = 1;
359         mutex_init(&ret->mutex);
360
361         rb_link_node(&ret->node, parent, p);
362         rb_insert_color(&ret->node, &locks_root->root);
363         return ret;
364 }
365
366 /*
367  * Search for a full stripe lock of a block group
368  *
369  * Return pointer to existing full stripe lock if found
370  * Return NULL if not found
371  */
372 static struct full_stripe_lock *search_full_stripe_lock(
373                 struct btrfs_full_stripe_locks_tree *locks_root,
374                 u64 fstripe_logical)
375 {
376         struct rb_node *node;
377         struct full_stripe_lock *entry;
378
379         lockdep_assert_held(&locks_root->lock);
380
381         node = locks_root->root.rb_node;
382         while (node) {
383                 entry = rb_entry(node, struct full_stripe_lock, node);
384                 if (fstripe_logical < entry->logical)
385                         node = node->rb_left;
386                 else if (fstripe_logical > entry->logical)
387                         node = node->rb_right;
388                 else
389                         return entry;
390         }
391         return NULL;
392 }
393
394 /*
395  * Helper to get full stripe logical from a normal bytenr.
396  *
397  * Caller must ensure @cache is a RAID56 block group.
398  */
399 static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
400                                    u64 bytenr)
401 {
402         u64 ret;
403
404         /*
405          * Due to chunk item size limit, full stripe length should not be
406          * larger than U32_MAX. Just a sanity check here.
407          */
408         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
409
410         /*
411          * round_down() can only handle powers of 2, while a RAID56 full
412          * stripe length can be 64KiB * n, so we need to round down manually.
413          */
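            /*
             * For example (hypothetical numbers): with key.objectid = 1G and
             * full_stripe_len = 192K (3 data stripes of 64K), bytenr = 1G + 200K
             * gives div64_u64(200K, 192K) = 1, so the full stripe starts at
             * 1G + 192K.
             */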
414         ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
415                 cache->full_stripe_len + cache->key.objectid;
416         return ret;
417 }
418
419 /*
420  * Lock a full stripe to avoid concurrency of recovery and read
421  *
422  * It's only used for profiles with parities (RAID5/6), for other profiles it
423  * does nothing.
424  *
425  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held;
426  * the caller must then call unlock_full_stripe() in the same context.
427  *
428  * Return <0 if an error is encountered.
429  */
430 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
431                             bool *locked_ret)
432 {
433         struct btrfs_block_group_cache *bg_cache;
434         struct btrfs_full_stripe_locks_tree *locks_root;
435         struct full_stripe_lock *existing;
436         u64 fstripe_start;
437         int ret = 0;
438
439         *locked_ret = false;
440         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
441         if (!bg_cache) {
442                 ASSERT(0);
443                 return -ENOENT;
444         }
445
446         /* Profiles not based on parity don't need full stripe lock */
447         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
448                 goto out;
449         locks_root = &bg_cache->full_stripe_locks_root;
450
451         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
452
453         /* Now insert the full stripe lock */
454         mutex_lock(&locks_root->lock);
455         existing = insert_full_stripe_lock(locks_root, fstripe_start);
456         mutex_unlock(&locks_root->lock);
457         if (IS_ERR(existing)) {
458                 ret = PTR_ERR(existing);
459                 goto out;
460         }
461         mutex_lock(&existing->mutex);
462         *locked_ret = true;
463 out:
464         btrfs_put_block_group(bg_cache);
465         return ret;
466 }
467
468 /*
469  * Unlock a full stripe.
470  *
471  * NOTE: Caller must ensure this is called in the same context as the
472  * corresponding lock_full_stripe().
473  *
474  * Return 0 if we unlocked the full stripe without problems.
475  * Return <0 for error.
476  */
477 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
478                               bool locked)
479 {
480         struct btrfs_block_group_cache *bg_cache;
481         struct btrfs_full_stripe_locks_tree *locks_root;
482         struct full_stripe_lock *fstripe_lock;
483         u64 fstripe_start;
484         bool freeit = false;
485         int ret = 0;
486
487         /* If we didn't acquire full stripe lock, no need to continue */
488         if (!locked)
489                 return 0;
490
491         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
492         if (!bg_cache) {
493                 ASSERT(0);
494                 return -ENOENT;
495         }
496         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
497                 goto out;
498
499         locks_root = &bg_cache->full_stripe_locks_root;
500         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
501
502         mutex_lock(&locks_root->lock);
503         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
504         /* Unpaired unlock_full_stripe() detected */
505         if (!fstripe_lock) {
506                 WARN_ON(1);
507                 ret = -ENOENT;
508                 mutex_unlock(&locks_root->lock);
509                 goto out;
510         }
511
512         if (fstripe_lock->refs == 0) {
513                 WARN_ON(1);
514                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
515                         fstripe_lock->logical);
516         } else {
517                 fstripe_lock->refs--;
518         }
519
520         if (fstripe_lock->refs == 0) {
521                 rb_erase(&fstripe_lock->node, &locks_root->root);
522                 freeit = true;
523         }
524         mutex_unlock(&locks_root->lock);
525
526         mutex_unlock(&fstripe_lock->mutex);
527         if (freeit)
528                 kfree(fstripe_lock);
529 out:
530         btrfs_put_block_group(bg_cache);
531         return ret;
532 }
533
534 /*
535  * used for workers that require transaction commits (i.e., for the
536  * NOCOW case)
537  */
538 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
539 {
540         struct btrfs_fs_info *fs_info = sctx->fs_info;
541
542         refcount_inc(&sctx->refs);
543         /*
544          * Increment scrubs_running to prevent cancel requests from
545          * completing as long as a worker is running. We must also
546          * increment scrubs_paused to prevent deadlocking on pause
547          * requests used for transaction commits (as the worker uses a
548          * transaction context). It is safe to regard the worker as
549          * paused for all practical matters; effectively, we only
550          * prevent cancellation requests from completing.
551          */
552         mutex_lock(&fs_info->scrub_lock);
553         atomic_inc(&fs_info->scrubs_running);
554         atomic_inc(&fs_info->scrubs_paused);
555         mutex_unlock(&fs_info->scrub_lock);
556
557         /*
558          * The check of the @scrubs_running == @scrubs_paused condition
559          * inside wait_event() is not an atomic operation, which means we
560          * may inc/dec @scrubs_running/paused at any time. Wake up
561          * @scrub_pause_wait as often as we can so that a blocked
562          * transaction commit waits as little as possible.
563          */
564         wake_up(&fs_info->scrub_pause_wait);
565
566         atomic_inc(&sctx->workers_pending);
567 }
568
569 /* used for workers that require transaction commits */
570 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
571 {
572         struct btrfs_fs_info *fs_info = sctx->fs_info;
573
574         /*
575          * see scrub_pending_trans_workers_inc() why we're pretending
576          * to be paused in the scrub counters
577          */
578         mutex_lock(&fs_info->scrub_lock);
579         atomic_dec(&fs_info->scrubs_running);
580         atomic_dec(&fs_info->scrubs_paused);
581         mutex_unlock(&fs_info->scrub_lock);
582         atomic_dec(&sctx->workers_pending);
583         wake_up(&fs_info->scrub_pause_wait);
584         wake_up(&sctx->list_wait);
585         scrub_put_ctx(sctx);
586 }
587
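    /* Release all checksums still queued on sctx->csum_list. */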
588 static void scrub_free_csums(struct scrub_ctx *sctx)
589 {
590         while (!list_empty(&sctx->csum_list)) {
591                 struct btrfs_ordered_sum *sum;
592                 sum = list_first_entry(&sctx->csum_list,
593                                        struct btrfs_ordered_sum, list);
594                 list_del(&sum->list);
595                 kfree(sum);
596         }
597 }
598
599 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
600 {
601         int i;
602
603         if (!sctx)
604                 return;
605
606         /* this can happen when scrub is cancelled */
607         if (sctx->curr != -1) {
608                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
609
610                 for (i = 0; i < sbio->page_count; i++) {
611                         WARN_ON(!sbio->pagev[i]->page);
612                         scrub_block_put(sbio->pagev[i]->sblock);
613                 }
614                 bio_put(sbio->bio);
615         }
616
617         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
618                 struct scrub_bio *sbio = sctx->bios[i];
619
620                 if (!sbio)
621                         break;
622                 kfree(sbio);
623         }
624
625         kfree(sctx->wr_curr_bio);
626         scrub_free_csums(sctx);
627         kfree(sctx);
628 }
629
630 static void scrub_put_ctx(struct scrub_ctx *sctx)
631 {
632         if (refcount_dec_and_test(&sctx->refs))
633                 scrub_free_ctx(sctx);
634 }
635
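    /*
     * Allocate and initialize a scrub context for @dev: preallocate the
     * SCRUB_BIOS_PER_SCTX scrub_bios, chain them through next_free, and set
     * up the locks, wait queue and (for dev-replace) the write target.
     */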
636 static noinline_for_stack
637 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
638 {
639         struct scrub_ctx *sctx;
640         int             i;
641         struct btrfs_fs_info *fs_info = dev->fs_info;
642
643         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
644         if (!sctx)
645                 goto nomem;
646         refcount_set(&sctx->refs, 1);
647         sctx->is_dev_replace = is_dev_replace;
648         sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
649         sctx->curr = -1;
650         sctx->fs_info = dev->fs_info;
651         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
652                 struct scrub_bio *sbio;
653
654                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
655                 if (!sbio)
656                         goto nomem;
657                 sctx->bios[i] = sbio;
658
659                 sbio->index = i;
660                 sbio->sctx = sctx;
661                 sbio->page_count = 0;
662                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
663                                 scrub_bio_end_io_worker, NULL, NULL);
664
665                 if (i != SCRUB_BIOS_PER_SCTX - 1)
666                         sctx->bios[i]->next_free = i + 1;
667                 else
668                         sctx->bios[i]->next_free = -1;
669         }
670         sctx->first_free = 0;
671         atomic_set(&sctx->bios_in_flight, 0);
672         atomic_set(&sctx->workers_pending, 0);
673         atomic_set(&sctx->cancel_req, 0);
674         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
675         INIT_LIST_HEAD(&sctx->csum_list);
676
677         spin_lock_init(&sctx->list_lock);
678         spin_lock_init(&sctx->stat_lock);
679         init_waitqueue_head(&sctx->list_wait);
680
681         WARN_ON(sctx->wr_curr_bio != NULL);
682         mutex_init(&sctx->wr_lock);
683         sctx->wr_curr_bio = NULL;
684         if (is_dev_replace) {
685                 WARN_ON(!fs_info->dev_replace.tgtdev);
686                 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
687                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
688                 sctx->flush_all_writes = false;
689         }
690
691         return sctx;
692
693 nomem:
694         scrub_free_ctx(sctx);
695         return ERR_PTR(-ENOMEM);
696 }
697
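    /*
     * Backref resolution callback used by scrub_print_warning(): look up the
     * inode item (root/inum/offset) for its size and link count, resolve the
     * file system paths of the inode and print one warning line per path
     * that references the corrupted extent.
     */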
698 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
699                                      void *warn_ctx)
700 {
701         u64 isize;
702         u32 nlink;
703         int ret;
704         int i;
705         unsigned nofs_flag;
706         struct extent_buffer *eb;
707         struct btrfs_inode_item *inode_item;
708         struct scrub_warning *swarn = warn_ctx;
709         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
710         struct inode_fs_paths *ipath = NULL;
711         struct btrfs_root *local_root;
712         struct btrfs_key root_key;
713         struct btrfs_key key;
714
715         root_key.objectid = root;
716         root_key.type = BTRFS_ROOT_ITEM_KEY;
717         root_key.offset = (u64)-1;
718         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
719         if (IS_ERR(local_root)) {
720                 ret = PTR_ERR(local_root);
721                 goto err;
722         }
723
724         /*
725          * this makes the path point to (inum INODE_ITEM ioff)
726          */
727         key.objectid = inum;
728         key.type = BTRFS_INODE_ITEM_KEY;
729         key.offset = 0;
730
731         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
732         if (ret) {
733                 btrfs_release_path(swarn->path);
734                 goto err;
735         }
736
737         eb = swarn->path->nodes[0];
738         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
739                                         struct btrfs_inode_item);
740         isize = btrfs_inode_size(eb, inode_item);
741         nlink = btrfs_inode_nlink(eb, inode_item);
742         btrfs_release_path(swarn->path);
743
744         /*
745          * init_ipath might indirectly call vmalloc, or use GFP_KERNEL. Scrub
746          * uses GFP_NOFS in this context, so we keep it consistent but it does
747          * not seem to be strictly necessary.
748          */
749         nofs_flag = memalloc_nofs_save();
750         ipath = init_ipath(4096, local_root, swarn->path);
751         memalloc_nofs_restore(nofs_flag);
752         if (IS_ERR(ipath)) {
753                 ret = PTR_ERR(ipath);
754                 ipath = NULL;
755                 goto err;
756         }
757         ret = paths_from_inode(inum, ipath);
758
759         if (ret < 0)
760                 goto err;
761
762         /*
763          * We deliberately ignore the fact that ipath might have been too
764          * small to hold all of the paths here.
765          */
766         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
767                 btrfs_warn_in_rcu(fs_info,
768 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
769                                   swarn->errstr, swarn->logical,
770                                   rcu_str_deref(swarn->dev->name),
771                                   swarn->physical,
772                                   root, inum, offset,
773                                   min(isize - offset, (u64)PAGE_SIZE), nlink,
774                                   (char *)(unsigned long)ipath->fspath->val[i]);
775
776         free_ipath(ipath);
777         return 0;
778
779 err:
780         btrfs_warn_in_rcu(fs_info,
781                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
782                           swarn->errstr, swarn->logical,
783                           rcu_str_deref(swarn->dev->name),
784                           swarn->physical,
785                           root, inum, offset, ret);
786
787         free_ipath(ipath);
788         return 0;
789 }
790
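    /*
     * Report a corrupted block: for tree blocks, walk the tree backrefs and
     * print the owning tree and level; for data extents, iterate the inodes
     * referencing the extent and print the resolved file paths.
     */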
791 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
792 {
793         struct btrfs_device *dev;
794         struct btrfs_fs_info *fs_info;
795         struct btrfs_path *path;
796         struct btrfs_key found_key;
797         struct extent_buffer *eb;
798         struct btrfs_extent_item *ei;
799         struct scrub_warning swarn;
800         unsigned long ptr = 0;
801         u64 extent_item_pos;
802         u64 flags = 0;
803         u64 ref_root;
804         u32 item_size;
805         u8 ref_level = 0;
806         int ret;
807
808         WARN_ON(sblock->page_count < 1);
809         dev = sblock->pagev[0]->dev;
810         fs_info = sblock->sctx->fs_info;
811
812         path = btrfs_alloc_path();
813         if (!path)
814                 return;
815
816         swarn.physical = sblock->pagev[0]->physical;
817         swarn.logical = sblock->pagev[0]->logical;
818         swarn.errstr = errstr;
819         swarn.dev = NULL;
820
821         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
822                                   &flags);
823         if (ret < 0)
824                 goto out;
825
826         extent_item_pos = swarn.logical - found_key.objectid;
827         swarn.extent_item_size = found_key.offset;
828
829         eb = path->nodes[0];
830         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
831         item_size = btrfs_item_size_nr(eb, path->slots[0]);
832
833         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
834                 do {
835                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
836                                                       item_size, &ref_root,
837                                                       &ref_level);
838                         btrfs_warn_in_rcu(fs_info,
839 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
840                                 errstr, swarn.logical,
841                                 rcu_str_deref(dev->name),
842                                 swarn.physical,
843                                 ref_level ? "node" : "leaf",
844                                 ret < 0 ? -1 : ref_level,
845                                 ret < 0 ? -1 : ref_root);
846                 } while (ret != 1);
847                 btrfs_release_path(path);
848         } else {
849                 btrfs_release_path(path);
850                 swarn.path = path;
851                 swarn.dev = dev;
852                 iterate_extent_inodes(fs_info, found_key.objectid,
853                                         extent_item_pos, 1,
854                                         scrub_print_warning_inode, &swarn, false);
855         }
856
857 out:
858         btrfs_free_path(path);
859 }
860
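    /*
     * Callback invoked for each inode referencing the bad logical address
     * (see scrub_fixup_nodatasum()): either rewrite a clean, up-to-date page
     * cache page to the bad sector via repair_io_failure(), or force a read
     * of the failed mirror so that the generic read path repairs it for us.
     */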
861 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
862 {
863         struct page *page = NULL;
864         unsigned long index;
865         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
866         int ret;
867         int corrected = 0;
868         struct btrfs_key key;
869         struct inode *inode = NULL;
870         struct btrfs_fs_info *fs_info;
871         u64 end = offset + PAGE_SIZE - 1;
872         struct btrfs_root *local_root;
873         int srcu_index;
874
875         key.objectid = root;
876         key.type = BTRFS_ROOT_ITEM_KEY;
877         key.offset = (u64)-1;
878
879         fs_info = fixup->root->fs_info;
880         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
881
882         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
883         if (IS_ERR(local_root)) {
884                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
885                 return PTR_ERR(local_root);
886         }
887
888         key.type = BTRFS_INODE_ITEM_KEY;
889         key.objectid = inum;
890         key.offset = 0;
891         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
892         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
893         if (IS_ERR(inode))
894                 return PTR_ERR(inode);
895
896         index = offset >> PAGE_SHIFT;
897
898         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
899         if (!page) {
900                 ret = -ENOMEM;
901                 goto out;
902         }
903
904         if (PageUptodate(page)) {
905                 if (PageDirty(page)) {
906                         /*
907                          * we need to write the data to the defective sector.
908                          * The data that was in that sector is not in memory,
909                          * because the page was modified. We must not write the
910                          * modified page to that sector.
911                          *
912                          * TODO: what could be done here: wait for the delalloc
913                          *       runner to write out that page (might involve
914                          *       COW) and see whether the sector is still
915                          *       referenced afterwards.
916                          *
917                          * For the meantime, we'll treat this error as
918                          * uncorrectable, although there is a chance that a
919                          * later scrub will find the bad sector again and that
920                          * there will be no dirty page in memory by then.
921                          */
922                         ret = -EIO;
923                         goto out;
924                 }
925                 ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
926                                         fixup->logical, page,
927                                         offset - page_offset(page),
928                                         fixup->mirror_num);
929                 unlock_page(page);
930                 corrected = !ret;
931         } else {
932                 /*
933                  * we need to get good data first. the general readpage path
934                  * will call repair_io_failure for us, we just have to make
935                  * sure we read the bad mirror.
936                  */
937                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
938                                         EXTENT_DAMAGED);
939                 if (ret) {
940                         /* set_extent_bits should give proper error */
941                         WARN_ON(ret > 0);
942                         if (ret > 0)
943                                 ret = -EFAULT;
944                         goto out;
945                 }
946
947                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
948                                                 btrfs_get_extent,
949                                                 fixup->mirror_num);
950                 wait_on_page_locked(page);
951
952                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
953                                                 end, EXTENT_DAMAGED, 0, NULL);
954                 if (!corrected)
955                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
956                                                 EXTENT_DAMAGED);
957         }
958
959 out:
960         if (page)
961                 put_page(page);
962
963         iput(inode);
964
965         if (ret < 0)
966                 return ret;
967
968         if (ret == 0 && corrected) {
969                 /*
970                  * we only need to call readpage for one of the inodes belonging
971                  * to this extent. so make iterate_extent_inodes stop
972                  */
973                 return 1;
974         }
975
976         return -EIO;
977 }
978
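    /*
     * Worker to repair a data error in an extent without checksum: iterate
     * all inodes referencing the bad logical address and trigger page cache
     * reads of the failed mirror (see scrub_fixup_readpage()), so the
     * regular read-repair path rewrites the bad copy if a good one exists.
     */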
979 static void scrub_fixup_nodatasum(struct btrfs_work *work)
980 {
981         struct btrfs_fs_info *fs_info;
982         int ret;
983         struct scrub_fixup_nodatasum *fixup;
984         struct scrub_ctx *sctx;
985         struct btrfs_trans_handle *trans = NULL;
986         struct btrfs_path *path;
987         int uncorrectable = 0;
988
989         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
990         sctx = fixup->sctx;
991         fs_info = fixup->root->fs_info;
992
993         path = btrfs_alloc_path();
994         if (!path) {
995                 spin_lock(&sctx->stat_lock);
996                 ++sctx->stat.malloc_errors;
997                 spin_unlock(&sctx->stat_lock);
998                 uncorrectable = 1;
999                 goto out;
1000         }
1001
1002         trans = btrfs_join_transaction(fixup->root);
1003         if (IS_ERR(trans)) {
1004                 uncorrectable = 1;
1005                 goto out;
1006         }
1007
1008         /*
1009          * the idea is to trigger a regular read through the standard path. we
1010          * read a page from the (failed) logical address by specifying the
1011          * corresponding copynum of the failed sector. thus, that readpage is
1012          * expected to fail.
1013          * that is the point where on-the-fly error correction will kick in
1014          * (once it's finished) and rewrite the failed sector if a good copy
1015          * can be found.
1016          */
1017         ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
1018                                           scrub_fixup_readpage, fixup, false);
1019         if (ret < 0) {
1020                 uncorrectable = 1;
1021                 goto out;
1022         }
1023         WARN_ON(ret != 1);
1024
1025         spin_lock(&sctx->stat_lock);
1026         ++sctx->stat.corrected_errors;
1027         spin_unlock(&sctx->stat_lock);
1028
1029 out:
1030         if (trans && !IS_ERR(trans))
1031                 btrfs_end_transaction(trans);
1032         if (uncorrectable) {
1033                 spin_lock(&sctx->stat_lock);
1034                 ++sctx->stat.uncorrectable_errors;
1035                 spin_unlock(&sctx->stat_lock);
1036                 btrfs_dev_replace_stats_inc(
1037                         &fs_info->dev_replace.num_uncorrectable_read_errors);
1038                 btrfs_err_rl_in_rcu(fs_info,
1039                     "unable to fixup (nodatasum) error at logical %llu on dev %s",
1040                         fixup->logical, rcu_str_deref(fixup->dev->name));
1041         }
1042
1043         btrfs_free_path(path);
1044         kfree(fixup);
1045
1046         scrub_pending_trans_workers_dec(sctx);
1047 }
1048
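     /*
      * A scrub_recover holds the btrfs_bio mapping used to re-read a block
      * from its mirrors; dropping the last reference also drops the bio
      * counter and releases the bbio.
      */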
1049 static inline void scrub_get_recover(struct scrub_recover *recover)
1050 {
1051         refcount_inc(&recover->refs);
1052 }
1053
1054 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
1055                                      struct scrub_recover *recover)
1056 {
1057         if (refcount_dec_and_test(&recover->refs)) {
1058                 btrfs_bio_counter_dec(fs_info);
1059                 btrfs_put_bbio(recover->bbio);
1060                 kfree(recover);
1061         }
1062 }
1063
1064 /*
1065  * scrub_handle_errored_block gets called when either verification of the
1066  * pages failed or the bio failed to read, e.g. with EIO. In the latter
1067  * case, this function handles all pages in the bio, even though only one
1068  * may be bad.
1069  * The goal of this function is to repair the errored block by using the
1070  * contents of one of the mirrors.
1071  */
1072 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
1073 {
1074         struct scrub_ctx *sctx = sblock_to_check->sctx;
1075         struct btrfs_device *dev;
1076         struct btrfs_fs_info *fs_info;
1077         u64 logical;
1078         unsigned int failed_mirror_index;
1079         unsigned int is_metadata;
1080         unsigned int have_csum;
1081         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
1082         struct scrub_block *sblock_bad;
1083         int ret;
1084         int mirror_index;
1085         int page_num;
1086         int success;
1087         bool full_stripe_locked;
1088         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1089                                       DEFAULT_RATELIMIT_BURST);
1090
1091         BUG_ON(sblock_to_check->page_count < 1);
1092         fs_info = sctx->fs_info;
1093         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1094                 /*
1095                  * if we find an error in a super block, we just report it.
1096                  * Super blocks get rewritten with the next transaction
1097                  * commit anyway.
1098                  */
1099                 spin_lock(&sctx->stat_lock);
1100                 ++sctx->stat.super_errors;
1101                 spin_unlock(&sctx->stat_lock);
1102                 return 0;
1103         }
1104         logical = sblock_to_check->pagev[0]->logical;
1105         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
1106         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
1107         is_metadata = !(sblock_to_check->pagev[0]->flags &
1108                         BTRFS_EXTENT_FLAG_DATA);
1109         have_csum = sblock_to_check->pagev[0]->have_csum;
1110         dev = sblock_to_check->pagev[0]->dev;
1111
1112         /*
1113          * For RAID5/6, races can happen between scrub threads of different
1114          * devices. On data corruption, the parity and the data scrub threads
1115          * will both try to recover the data.
1116          * Such a race can lead to a doubly counted csum error, or even an
1117          * unrecoverable error.
1118          */
1119         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
1120         if (ret < 0) {
1121                 spin_lock(&sctx->stat_lock);
1122                 if (ret == -ENOMEM)
1123                         sctx->stat.malloc_errors++;
1124                 sctx->stat.read_errors++;
1125                 sctx->stat.uncorrectable_errors++;
1126                 spin_unlock(&sctx->stat_lock);
1127                 return ret;
1128         }
1129
1130         /*
1131          * Read all mirrors one after the other. This includes re-reading
1132          * the extent or metadata block that failed (the reason this fixup
1133          * code is called), this time page by page, in order to know which
1134          * pages caused I/O errors and which ones are good (for all
1135          * mirrors).
1136          * It is the goal to handle the situation when more than one
1137          * mirror contains I/O errors, but the errors do not
1138          * overlap, i.e. the data can be repaired by selecting the
1139          * pages from those mirrors without I/O error on the
1140          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
1141          * would be that mirror #1 has an I/O error on the first page,
1142          * the second page is good, and mirror #2 has an I/O error on
1143          * the second page, but the first page is good.
1144          * Then the first page of the first mirror can be repaired by
1145          * taking the first page of the second mirror, and the
1146          * second page of the second mirror can be repaired by
1147          * copying the contents of the 2nd page of the 1st mirror.
1148          * One more note: if the pages of one mirror contain I/O
1149          * errors, the checksum cannot be verified. In order to get
1150          * the best data for repairing, the first attempt is to find
1151          * a mirror without I/O errors and with a validated checksum.
1152          * Only if this is not possible, the pages are picked from
1153          * mirrors with I/O errors without considering the checksum.
1154          * If the latter is the case, at the end, the checksum of the
1155          * repaired area is verified in order to correctly maintain
1156          * the statistics.
1157          */
1158
1159         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
1160                                       sizeof(*sblocks_for_recheck), GFP_NOFS);
1161         if (!sblocks_for_recheck) {
1162                 spin_lock(&sctx->stat_lock);
1163                 sctx->stat.malloc_errors++;
1164                 sctx->stat.read_errors++;
1165                 sctx->stat.uncorrectable_errors++;
1166                 spin_unlock(&sctx->stat_lock);
1167                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1168                 goto out;
1169         }
1170
1171         /* setup the context, map the logical blocks and alloc the pages */
1172         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
1173         if (ret) {
1174                 spin_lock(&sctx->stat_lock);
1175                 sctx->stat.read_errors++;
1176                 sctx->stat.uncorrectable_errors++;
1177                 spin_unlock(&sctx->stat_lock);
1178                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1179                 goto out;
1180         }
1181         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
1182         sblock_bad = sblocks_for_recheck + failed_mirror_index;
1183
1184         /* build and submit the bios for the failed mirror, check checksums */
1185         scrub_recheck_block(fs_info, sblock_bad, 1);
1186
1187         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
1188             sblock_bad->no_io_error_seen) {
1189                 /*
1190                  * the error disappeared after reading page by page, or
1191                  * the area was part of a huge bio and other parts of the
1192                  * bio caused I/O errors, or the block layer merged several
1193                  * read requests into one and the error is caused by a
1194                  * different bio (usually one of the two latter cases is
1195                  * the cause)
1196                  */
1197                 spin_lock(&sctx->stat_lock);
1198                 sctx->stat.unverified_errors++;
1199                 sblock_to_check->data_corrected = 1;
1200                 spin_unlock(&sctx->stat_lock);
1201
1202                 if (sctx->is_dev_replace)
1203                         scrub_write_block_to_dev_replace(sblock_bad);
1204                 goto out;
1205         }
1206
1207         if (!sblock_bad->no_io_error_seen) {
1208                 spin_lock(&sctx->stat_lock);
1209                 sctx->stat.read_errors++;
1210                 spin_unlock(&sctx->stat_lock);
1211                 if (__ratelimit(&_rs))
1212                         scrub_print_warning("i/o error", sblock_to_check);
1213                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1214         } else if (sblock_bad->checksum_error) {
1215                 spin_lock(&sctx->stat_lock);
1216                 sctx->stat.csum_errors++;
1217                 spin_unlock(&sctx->stat_lock);
1218                 if (__ratelimit(&_rs))
1219                         scrub_print_warning("checksum error", sblock_to_check);
1220                 btrfs_dev_stat_inc_and_print(dev,
1221                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1222         } else if (sblock_bad->header_error) {
1223                 spin_lock(&sctx->stat_lock);
1224                 sctx->stat.verify_errors++;
1225                 spin_unlock(&sctx->stat_lock);
1226                 if (__ratelimit(&_rs))
1227                         scrub_print_warning("checksum/header error",
1228                                             sblock_to_check);
1229                 if (sblock_bad->generation_error)
1230                         btrfs_dev_stat_inc_and_print(dev,
1231                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1232                 else
1233                         btrfs_dev_stat_inc_and_print(dev,
1234                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1235         }
1236
1237         if (sctx->readonly) {
1238                 ASSERT(!sctx->is_dev_replace);
1239                 goto out;
1240         }
1241
1242         /*
1243          * NOTE: Even in the nodatasum case, it's still possible that this is
1244          * a compressed data extent, so scrub_fixup_nodatasum(), which writes
1245          * the inode page cache onto disk, could cause serious data corruption.
1246          *
1247          * So here we can only read from disk, and hope our recovery reaches
1248          * the disk before the newer write.
1249          */
1250         if (0 && !is_metadata && !have_csum) {
1251                 struct scrub_fixup_nodatasum *fixup_nodatasum;
1252
1253                 WARN_ON(sctx->is_dev_replace);
1254
1255                 /*
1256                  * !is_metadata and !have_csum, this means that the data
1257                  * might not be COWed, that it might be modified
1258                  * concurrently. The general strategy to work on the
1259                  * commit root does not help in the case when COW is not
1260                  * used.
1261                  */
1262                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1263                 if (!fixup_nodatasum)
1264                         goto did_not_correct_error;
1265                 fixup_nodatasum->sctx = sctx;
1266                 fixup_nodatasum->dev = dev;
1267                 fixup_nodatasum->logical = logical;
1268                 fixup_nodatasum->root = fs_info->extent_root;
1269                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1270                 scrub_pending_trans_workers_inc(sctx);
1271                 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1272                                 scrub_fixup_nodatasum, NULL, NULL);
1273                 btrfs_queue_work(fs_info->scrub_workers,
1274                                  &fixup_nodatasum->work);
1275                 goto out;
1276         }
1277
1278         /*
1279          * now build and submit the bios for the other mirrors, check
1280          * checksums.
1281          * First try to pick the mirror which is completely without I/O
1282          * errors and also does not have a checksum error.
1283          * If one is found, and if a checksum is present, the full block
1284          * that is known to contain an error is rewritten. Afterwards
1285          * the block is known to be corrected.
1286          * If a mirror is found which is completely correct, and no
1287          * checksum is present, only those pages are rewritten that had
1288          * an I/O error in the block to be repaired, since it cannot be
1289          * determined, which copy of the other pages is better (and it
1290          * could happen otherwise that a correct page would be
1291          * overwritten by a bad one).
1292          */
1293         for (mirror_index = 0; ;mirror_index++) {
1294                 struct scrub_block *sblock_other;
1295
1296                 if (mirror_index == failed_mirror_index)
1297                         continue;
1298
1299                 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1300                 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1301                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1302                                 break;
1303                         if (!sblocks_for_recheck[mirror_index].page_count)
1304                                 break;
1305
1306                         sblock_other = sblocks_for_recheck + mirror_index;
1307                 } else {
1308                         struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1309                         int max_allowed = r->bbio->num_stripes -
1310                                                 r->bbio->num_tgtdevs;
1311
1312                         if (mirror_index >= max_allowed)
1313                                 break;
1314                         if (!sblocks_for_recheck[1].page_count)
1315                                 break;
1316
1317                         ASSERT(failed_mirror_index == 0);
1318                         sblock_other = sblocks_for_recheck + 1;
1319                         sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1320                 }
1321
1322                 /* build and submit the bios, check checksums */
1323                 scrub_recheck_block(fs_info, sblock_other, 0);
1324
1325                 if (!sblock_other->header_error &&
1326                     !sblock_other->checksum_error &&
1327                     sblock_other->no_io_error_seen) {
1328                         if (sctx->is_dev_replace) {
1329                                 scrub_write_block_to_dev_replace(sblock_other);
1330                                 goto corrected_error;
1331                         } else {
1332                                 ret = scrub_repair_block_from_good_copy(
1333                                                 sblock_bad, sblock_other);
1334                                 if (!ret)
1335                                         goto corrected_error;
1336                         }
1337                 }
1338         }
1339
1340         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1341                 goto did_not_correct_error;
1342
1343         /*
1344          * In case of I/O errors in the area that is supposed to be
1345          * repaired, continue by picking good copies of those pages.
1346          * Select the good pages from mirrors to rewrite bad pages from
1347          * the area to fix. Afterwards verify the checksum of the block
1348          * that is supposed to be repaired. This verification step is
1349          * only done for the purpose of statistics counting and for the
1350          * final scrub report on whether errors remain.
1351          * A perfect algorithm could make use of the checksum and try
1352          * all possible combinations of pages from the different mirrors
1353          * until the checksum verification succeeds. For example, when
1354          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1355          * of mirror #2 is readable but the final checksum test fails,
1356          * then the 2nd page of mirror #3 could be tried, to see whether
1357          * the final checksum now succeeds. But this would be a rare
1358          * exception and is therefore not implemented. At least it is
1359          * avoided that the good copy is overwritten.
1360          * A more useful improvement would be to pick the sectors
1361          * without I/O error based on sector sizes (512 bytes on legacy
1362          * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1363          * mirror could be repaired by taking 512 byte of a different
1364          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1365          * area are unreadable.
1366          */
1367         success = 1;
1368         for (page_num = 0; page_num < sblock_bad->page_count;
1369              page_num++) {
1370                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1371                 struct scrub_block *sblock_other = NULL;
1372
1373                 /* in plain scrub, skip pages that had no I/O error */
1374                 if (!page_bad->io_error && !sctx->is_dev_replace)
1375                         continue;
1376
1377                 if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1378                         /*
1379                          * In case of dev replace, if the raid56 rebuild process
1380                          * did not produce correct data, copy the content of
1381                          * sblock_bad to make sure the target device is identical
1382                          * to the source device, instead of writing garbage data
1383                          * from the sblocks_for_recheck array to the target device.
1384                          */
1385                         sblock_other = NULL;
1386                 } else if (page_bad->io_error) {
1387                         /* try to find a page without I/O errors in another mirror */
1388                         for (mirror_index = 0;
1389                              mirror_index < BTRFS_MAX_MIRRORS &&
1390                              sblocks_for_recheck[mirror_index].page_count > 0;
1391                              mirror_index++) {
1392                                 if (!sblocks_for_recheck[mirror_index].
1393                                     pagev[page_num]->io_error) {
1394                                         sblock_other = sblocks_for_recheck +
1395                                                        mirror_index;
1396                                         break;
1397                                 }
1398                         }
1399                         if (!sblock_other)
1400                                 success = 0;
1401                 }
1402
1403                 if (sctx->is_dev_replace) {
1404                         /*
1405                          * We did not find a mirror to fetch the page
1406                          * from. scrub_write_page_to_dev_replace()
1407                          * handles this case (page->io_error) by
1408                          * filling the block with zeros before
1409                          * submitting the write request.
1410                          */
1411                         if (!sblock_other)
1412                                 sblock_other = sblock_bad;
1413
1414                         if (scrub_write_page_to_dev_replace(sblock_other,
1415                                                             page_num) != 0) {
1416                                 btrfs_dev_replace_stats_inc(
1417                                         &fs_info->dev_replace.num_write_errors);
1418                                 success = 0;
1419                         }
1420                 } else if (sblock_other) {
1421                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1422                                                                sblock_other,
1423                                                                page_num, 0);
1424                         if (!ret)
1425                                 page_bad->io_error = 0;
1426                         else
1427                                 success = 0;
1428                 }
1429         }
1430
1431         if (success && !sctx->is_dev_replace) {
1432                 if (is_metadata || have_csum) {
1433                         /*
1434                          * We need to verify the checksum now that all
1435                          * sectors on disk are repaired (the write
1436                          * request for the data to be repaired is on its way).
1437                          * Just be lazy and use scrub_recheck_block()
1438                          * which re-reads the data before the checksum
1439                          * is verified, but most likely the data comes out
1440                          * of the page cache.
1441                          */
1442                         scrub_recheck_block(fs_info, sblock_bad, 1);
1443                         if (!sblock_bad->header_error &&
1444                             !sblock_bad->checksum_error &&
1445                             sblock_bad->no_io_error_seen)
1446                                 goto corrected_error;
1447                         else
1448                                 goto did_not_correct_error;
1449                 } else {
1450 corrected_error:
1451                         spin_lock(&sctx->stat_lock);
1452                         sctx->stat.corrected_errors++;
1453                         sblock_to_check->data_corrected = 1;
1454                         spin_unlock(&sctx->stat_lock);
1455                         btrfs_err_rl_in_rcu(fs_info,
1456                                 "fixed up error at logical %llu on dev %s",
1457                                 logical, rcu_str_deref(dev->name));
1458                 }
1459         } else {
1460 did_not_correct_error:
1461                 spin_lock(&sctx->stat_lock);
1462                 sctx->stat.uncorrectable_errors++;
1463                 spin_unlock(&sctx->stat_lock);
1464                 btrfs_err_rl_in_rcu(fs_info,
1465                         "unable to fixup (regular) error at logical %llu on dev %s",
1466                         logical, rcu_str_deref(dev->name));
1467         }
1468
1469 out:
1470         if (sblocks_for_recheck) {
1471                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1472                      mirror_index++) {
1473                         struct scrub_block *sblock = sblocks_for_recheck +
1474                                                      mirror_index;
1475                         struct scrub_recover *recover;
1476                         int page_index;
1477
1478                         for (page_index = 0; page_index < sblock->page_count;
1479                              page_index++) {
1480                                 sblock->pagev[page_index]->sblock = NULL;
1481                                 recover = sblock->pagev[page_index]->recover;
1482                                 if (recover) {
1483                                         scrub_put_recover(fs_info, recover);
1484                                         sblock->pagev[page_index]->recover =
1485                                                                         NULL;
1486                                 }
1487                                 scrub_page_put(sblock->pagev[page_index]);
1488                         }
1489                 }
1490                 kfree(sblocks_for_recheck);
1491         }
1492
1493         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1494         if (ret < 0)
1495                 return ret;
1496         return 0;
1497 }
1498
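/*
 * Number of distinct ways the data of a stripe can be retrieved, which is
 * what the recheck loop uses as the mirror count: for RAID5 the data can be
 * read directly or rebuilt from the remaining stripes (2), for RAID6 one
 * more reconstruction combination exists (3). For all other profiles every
 * stripe returned by the mapping is a full copy.
 */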
1499 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1500 {
1501         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1502                 return 2;
1503         else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1504                 return 3;
1505         else
1506                 return (int)bbio->num_stripes;
1507 }
1508
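/*
 * Map a logical address plus a mirror number to the stripe of the bbio that
 * holds the data and the byte offset inside that stripe. For RAID5/6 the
 * raid_map is searched for the data stripe covering the logical address
 * (P and Q stripes are skipped); for the other profiles the mirror number
 * directly selects the stripe and the offset is 0.
 */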
1509 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1510                                                  u64 *raid_map,
1511                                                  u64 mapped_length,
1512                                                  int nstripes, int mirror,
1513                                                  int *stripe_index,
1514                                                  u64 *stripe_offset)
1515 {
1516         int i;
1517
1518         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1519                 /* RAID5/6 */
1520                 for (i = 0; i < nstripes; i++) {
1521                         if (raid_map[i] == RAID6_Q_STRIPE ||
1522                             raid_map[i] == RAID5_P_STRIPE)
1523                                 continue;
1524
1525                         if (logical >= raid_map[i] &&
1526                             logical < raid_map[i] + mapped_length)
1527                                 break;
1528                 }
1529
1530                 *stripe_index = i;
1531                 *stripe_offset = logical - raid_map[i];
1532         } else {
1533                 /* The other RAID type */
1534                 *stripe_index = mirror;
1535                 *stripe_offset = 0;
1536         }
1537 }
1538
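/*
 * Build the per-mirror recheck blocks for a failed scrub block: for each
 * PAGE_SIZE piece of the original block, map the logical address with
 * BTRFS_MAP_GET_READ_MIRRORS and add one page to the scrub_block of every
 * mirror in sblocks_for_recheck. Each page also takes a reference on a
 * scrub_recover that keeps the bbio around for a possible raid56 rebuild.
 */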
1539 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1540                                      struct scrub_block *sblocks_for_recheck)
1541 {
1542         struct scrub_ctx *sctx = original_sblock->sctx;
1543         struct btrfs_fs_info *fs_info = sctx->fs_info;
1544         u64 length = original_sblock->page_count * PAGE_SIZE;
1545         u64 logical = original_sblock->pagev[0]->logical;
1546         u64 generation = original_sblock->pagev[0]->generation;
1547         u64 flags = original_sblock->pagev[0]->flags;
1548         u64 have_csum = original_sblock->pagev[0]->have_csum;
1549         struct scrub_recover *recover;
1550         struct btrfs_bio *bbio;
1551         u64 sublen;
1552         u64 mapped_length;
1553         u64 stripe_offset;
1554         int stripe_index;
1555         int page_index = 0;
1556         int mirror_index;
1557         int nmirrors;
1558         int ret;
1559
1560         /*
1561          * Note: the two members refs and outstanding_pages are not used
1562          * (and not set) in the blocks that are used for the recheck
1563          * procedure.
1564          */
1565
1566         while (length > 0) {
1567                 sublen = min_t(u64, length, PAGE_SIZE);
1568                 mapped_length = sublen;
1569                 bbio = NULL;
1570
1571                 /*
1572                  * with a length of PAGE_SIZE, each returned stripe
1573                  * represents one mirror
1574                  */
1575                 btrfs_bio_counter_inc_blocked(fs_info);
1576                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1577                                 logical, &mapped_length, &bbio);
1578                 if (ret || !bbio || mapped_length < sublen) {
1579                         btrfs_put_bbio(bbio);
1580                         btrfs_bio_counter_dec(fs_info);
1581                         return -EIO;
1582                 }
1583
1584                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1585                 if (!recover) {
1586                         btrfs_put_bbio(bbio);
1587                         btrfs_bio_counter_dec(fs_info);
1588                         return -ENOMEM;
1589                 }
1590
1591                 refcount_set(&recover->refs, 1);
1592                 recover->bbio = bbio;
1593                 recover->map_length = mapped_length;
1594
1595                 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1596
1597                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1598
1599                 for (mirror_index = 0; mirror_index < nmirrors;
1600                      mirror_index++) {
1601                         struct scrub_block *sblock;
1602                         struct scrub_page *page;
1603
1604                         sblock = sblocks_for_recheck + mirror_index;
1605                         sblock->sctx = sctx;
1606
1607                         page = kzalloc(sizeof(*page), GFP_NOFS);
1608                         if (!page) {
1609 leave_nomem:
1610                                 spin_lock(&sctx->stat_lock);
1611                                 sctx->stat.malloc_errors++;
1612                                 spin_unlock(&sctx->stat_lock);
1613                                 scrub_put_recover(fs_info, recover);
1614                                 return -ENOMEM;
1615                         }
1616                         scrub_page_get(page);
1617                         sblock->pagev[page_index] = page;
1618                         page->sblock = sblock;
1619                         page->flags = flags;
1620                         page->generation = generation;
1621                         page->logical = logical;
1622                         page->have_csum = have_csum;
1623                         if (have_csum)
1624                                 memcpy(page->csum,
1625                                        original_sblock->pagev[0]->csum,
1626                                        sctx->csum_size);
1627
1628                         scrub_stripe_index_and_offset(logical,
1629                                                       bbio->map_type,
1630                                                       bbio->raid_map,
1631                                                       mapped_length,
1632                                                       bbio->num_stripes -
1633                                                       bbio->num_tgtdevs,
1634                                                       mirror_index,
1635                                                       &stripe_index,
1636                                                       &stripe_offset);
1637                         page->physical = bbio->stripes[stripe_index].physical +
1638                                          stripe_offset;
1639                         page->dev = bbio->stripes[stripe_index].dev;
1640
1641                         BUG_ON(page_index >= original_sblock->page_count);
1642                         page->physical_for_dev_replace =
1643                                 original_sblock->pagev[page_index]->
1644                                 physical_for_dev_replace;
1645                         /* for missing devices, dev->bdev is NULL */
1646                         page->mirror_num = mirror_index + 1;
1647                         sblock->page_count++;
1648                         page->page = alloc_page(GFP_NOFS);
1649                         if (!page->page)
1650                                 goto leave_nomem;
1651
1652                         scrub_get_recover(recover);
1653                         page->recover = recover;
1654                 }
1655                 scrub_put_recover(fs_info, recover);
1656                 length -= sublen;
1657                 logical += sublen;
1658                 page_index++;
1659         }
1660
1661         return 0;
1662 }
1663
1664 static void scrub_bio_wait_endio(struct bio *bio)
1665 {
1666         complete(bio->bi_private);
1667 }
1668
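/*
 * Synchronously read one block through the raid56 recovery code. The pages
 * of the block must already be attached to @bio; completion is signalled
 * via scrub_bio_wait_endio() and the bio status is converted to an errno.
 */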
1669 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1670                                         struct bio *bio,
1671                                         struct scrub_page *page)
1672 {
1673         DECLARE_COMPLETION_ONSTACK(done);
1674         int ret;
1675         int mirror_num;
1676
1677         bio->bi_iter.bi_sector = page->logical >> 9;
1678         bio->bi_private = &done;
1679         bio->bi_end_io = scrub_bio_wait_endio;
1680
1681         mirror_num = page->sblock->pagev[0]->mirror_num;
1682         ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
1683                                     page->recover->map_length,
1684                                     mirror_num, 0);
1685         if (ret)
1686                 return ret;
1687
1688         wait_for_completion_io(&done);
1689         return blk_status_to_errno(bio->bi_status);
1690 }
1691
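/*
 * Recheck a block that sits on a raid56 stripe: a single rebuilt read is
 * issued for all pages of the block at once, because raid56 recovery
 * operates on whole stripes. If the device is missing or the rebuild
 * fails, every page of the block is marked with an I/O error.
 */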
1692 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1693                                           struct scrub_block *sblock)
1694 {
1695         struct scrub_page *first_page = sblock->pagev[0];
1696         struct bio *bio;
1697         int page_num;
1698
1699         /* All pages in sblock belong to the same stripe on the same device. */
1700         ASSERT(first_page->dev);
1701         if (!first_page->dev->bdev)
1702                 goto out;
1703
1704         bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
1705         bio_set_dev(bio, first_page->dev->bdev);
1706
1707         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1708                 struct scrub_page *page = sblock->pagev[page_num];
1709
1710                 WARN_ON(!page->page);
1711                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1712         }
1713
1714         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1715                 bio_put(bio);
1716                 goto out;
1717         }
1718
1719         bio_put(bio);
1720
1721         scrub_recheck_block_checksum(sblock);
1722
1723         return;
1724 out:
1725         for (page_num = 0; page_num < sblock->page_count; page_num++)
1726                 sblock->pagev[page_num]->io_error = 1;
1727
1728         sblock->no_io_error_seen = 0;
1729 }
1730
1731 /*
1732  * This function checks the on-disk data for checksum errors, header errors
1733  * and read I/O errors. If any I/O error happens, the exact pages that failed
1734  * are marked as bad. The goal is to enable scrub to take the good pages from
1735  * all the mirrors so that the pages that failed in the just handled mirror
1736  * can be repaired.
1737  */
1738 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1739                                 struct scrub_block *sblock,
1740                                 int retry_failed_mirror)
1741 {
1742         int page_num;
1743
1744         sblock->no_io_error_seen = 1;
1745
1746         /* shortcut for raid56 */
1747         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1748                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1749
1750         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1751                 struct bio *bio;
1752                 struct scrub_page *page = sblock->pagev[page_num];
1753
1754                 if (page->dev->bdev == NULL) {
1755                         page->io_error = 1;
1756                         sblock->no_io_error_seen = 0;
1757                         continue;
1758                 }
1759
1760                 WARN_ON(!page->page);
1761                 bio = btrfs_io_bio_alloc(1);
1762                 bio_set_dev(bio, page->dev->bdev);
1763
1764                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1765                 bio->bi_iter.bi_sector = page->physical >> 9;
1766                 bio->bi_opf = REQ_OP_READ;
1767
1768                 if (btrfsic_submit_bio_wait(bio)) {
1769                         page->io_error = 1;
1770                         sblock->no_io_error_seen = 0;
1771                 }
1772
1773                 bio_put(bio);
1774         }
1775
1776         if (sblock->no_io_error_seen)
1777                 scrub_recheck_block_checksum(sblock);
1778 }
1779
1780 static inline int scrub_check_fsid(u8 fsid[],
1781                                    struct scrub_page *spage)
1782 {
1783         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1784         int ret;
1785
1786         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1787         return !ret;
1788 }
1789
1790 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1791 {
1792         sblock->header_error = 0;
1793         sblock->checksum_error = 0;
1794         sblock->generation_error = 0;
1795
1796         if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1797                 scrub_checksum_data(sblock);
1798         else
1799                 scrub_checksum_tree_block(sblock);
1800 }
1801
1802 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1803                                              struct scrub_block *sblock_good)
1804 {
1805         int page_num;
1806         int ret = 0;
1807
1808         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1809                 int ret_sub;
1810
1811                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1812                                                            sblock_good,
1813                                                            page_num, 1);
1814                 if (ret_sub)
1815                         ret = ret_sub;
1816         }
1817
1818         return ret;
1819 }
1820
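/*
 * Rewrite a single page of the bad block with the corresponding page of a
 * good mirror. With @force_write the page is rewritten unconditionally;
 * otherwise only if the bad block showed a header or checksum error or
 * this particular page had an I/O error.
 */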
1821 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1822                                             struct scrub_block *sblock_good,
1823                                             int page_num, int force_write)
1824 {
1825         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1826         struct scrub_page *page_good = sblock_good->pagev[page_num];
1827         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1828
1829         BUG_ON(page_bad->page == NULL);
1830         BUG_ON(page_good->page == NULL);
1831         if (force_write || sblock_bad->header_error ||
1832             sblock_bad->checksum_error || page_bad->io_error) {
1833                 struct bio *bio;
1834                 int ret;
1835
1836                 if (!page_bad->dev->bdev) {
1837                         btrfs_warn_rl(fs_info,
1838                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1839                         return -EIO;
1840                 }
1841
1842                 bio = btrfs_io_bio_alloc(1);
1843                 bio_set_dev(bio, page_bad->dev->bdev);
1844                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1845                 bio->bi_opf = REQ_OP_WRITE;
1846
1847                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1848                 if (PAGE_SIZE != ret) {
1849                         bio_put(bio);
1850                         return -EIO;
1851                 }
1852
1853                 if (btrfsic_submit_bio_wait(bio)) {
1854                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1855                                 BTRFS_DEV_STAT_WRITE_ERRS);
1856                         btrfs_dev_replace_stats_inc(
1857                                 &fs_info->dev_replace.num_write_errors);
1858                         bio_put(bio);
1859                         return -EIO;
1860                 }
1861                 bio_put(bio);
1862         }
1863
1864         return 0;
1865 }
1866
1867 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1868 {
1869         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1870         int page_num;
1871
1872         /*
1873          * This block is used for checking the parity on the source device,
1874          * so the data need not be written to the destination device.
1875          */
1876         if (sblock->sparity)
1877                 return;
1878
1879         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1880                 int ret;
1881
1882                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1883                 if (ret)
1884                         btrfs_dev_replace_stats_inc(
1885                                 &fs_info->dev_replace.num_write_errors);
1886         }
1887 }
1888
1889 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1890                                            int page_num)
1891 {
1892         struct scrub_page *spage = sblock->pagev[page_num];
1893
1894         BUG_ON(spage->page == NULL);
1895         if (spage->io_error) {
1896                 void *mapped_buffer = kmap_atomic(spage->page);
1897
1898                 clear_page(mapped_buffer);
1899                 flush_dcache_page(spage->page);
1900                 kunmap_atomic(mapped_buffer);
1901         }
1902         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1903 }
1904
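/*
 * Queue a page for writing to the dev-replace target. Pages are appended to
 * the current write bio as long as they are physically and logically
 * contiguous with it; otherwise the bio is submitted and a new one is
 * started. A full bio (pages_per_wr_bio pages) is submitted immediately.
 */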
1905 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1906                                     struct scrub_page *spage)
1907 {
1908         struct scrub_bio *sbio;
1909         int ret;
1910
1911         mutex_lock(&sctx->wr_lock);
1912 again:
1913         if (!sctx->wr_curr_bio) {
1914                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1915                                               GFP_KERNEL);
1916                 if (!sctx->wr_curr_bio) {
1917                         mutex_unlock(&sctx->wr_lock);
1918                         return -ENOMEM;
1919                 }
1920                 sctx->wr_curr_bio->sctx = sctx;
1921                 sctx->wr_curr_bio->page_count = 0;
1922         }
1923         sbio = sctx->wr_curr_bio;
1924         if (sbio->page_count == 0) {
1925                 struct bio *bio;
1926
1927                 sbio->physical = spage->physical_for_dev_replace;
1928                 sbio->logical = spage->logical;
1929                 sbio->dev = sctx->wr_tgtdev;
1930                 bio = sbio->bio;
1931                 if (!bio) {
1932                         bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1933                         sbio->bio = bio;
1934                 }
1935
1936                 bio->bi_private = sbio;
1937                 bio->bi_end_io = scrub_wr_bio_end_io;
1938                 bio_set_dev(bio, sbio->dev->bdev);
1939                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1940                 bio->bi_opf = REQ_OP_WRITE;
1941                 sbio->status = 0;
1942         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1943                    spage->physical_for_dev_replace ||
1944                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1945                    spage->logical) {
1946                 scrub_wr_submit(sctx);
1947                 goto again;
1948         }
1949
1950         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1951         if (ret != PAGE_SIZE) {
1952                 if (sbio->page_count < 1) {
1953                         bio_put(sbio->bio);
1954                         sbio->bio = NULL;
1955                         mutex_unlock(&sctx->wr_lock);
1956                         return -EIO;
1957                 }
1958                 scrub_wr_submit(sctx);
1959                 goto again;
1960         }
1961
1962         sbio->pagev[sbio->page_count] = spage;
1963         scrub_page_get(spage);
1964         sbio->page_count++;
1965         if (sbio->page_count == sctx->pages_per_wr_bio)
1966                 scrub_wr_submit(sctx);
1967         mutex_unlock(&sctx->wr_lock);
1968
1969         return 0;
1970 }
1971
1972 static void scrub_wr_submit(struct scrub_ctx *sctx)
1973 {
1974         struct scrub_bio *sbio;
1975
1976         if (!sctx->wr_curr_bio)
1977                 return;
1978
1979         sbio = sctx->wr_curr_bio;
1980         sctx->wr_curr_bio = NULL;
1981         WARN_ON(!sbio->bio->bi_disk);
1982         scrub_pending_bio_inc(sctx);
1983         /* Process all writes in a single worker thread. Then the block layer
1984          * orders the requests before sending them to the driver, which
1985          * doubled the write performance on spinning disks when measured
1986          * with Linux 3.5. */
1987         btrfsic_submit_bio(sbio->bio);
1988 }
1989
1990 static void scrub_wr_bio_end_io(struct bio *bio)
1991 {
1992         struct scrub_bio *sbio = bio->bi_private;
1993         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1994
1995         sbio->status = bio->bi_status;
1996         sbio->bio = bio;
1997
1998         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1999                          scrub_wr_bio_end_io_worker, NULL, NULL);
2000         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
2001 }
2002
2003 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
2004 {
2005         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2006         struct scrub_ctx *sctx = sbio->sctx;
2007         int i;
2008
2009         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
2010         if (sbio->status) {
2011                 struct btrfs_dev_replace *dev_replace =
2012                         &sbio->sctx->fs_info->dev_replace;
2013
2014                 for (i = 0; i < sbio->page_count; i++) {
2015                         struct scrub_page *spage = sbio->pagev[i];
2016
2017                         spage->io_error = 1;
2018                         btrfs_dev_replace_stats_inc(
2019                                         &dev_replace->num_write_errors);
2020                 }
2021         }
2022
2023         for (i = 0; i < sbio->page_count; i++)
2024                 scrub_page_put(sbio->pagev[i]);
2025
2026         bio_put(sbio->bio);
2027         kfree(sbio);
2028         scrub_pending_bio_dec(sctx);
2029 }
2030
2031 static int scrub_checksum(struct scrub_block *sblock)
2032 {
2033         u64 flags;
2034         int ret;
2035
2036         /*
2037          * No need to initialize these stats currently,
2038          * because this function only uses the return value
2039          * instead of these stats values.
2040          *
2041          * Todo:
2042          * always use stats
2043          */
2044         sblock->header_error = 0;
2045         sblock->generation_error = 0;
2046         sblock->checksum_error = 0;
2047
2048         WARN_ON(sblock->page_count < 1);
2049         flags = sblock->pagev[0]->flags;
2050         ret = 0;
2051         if (flags & BTRFS_EXTENT_FLAG_DATA)
2052                 ret = scrub_checksum_data(sblock);
2053         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
2054                 ret = scrub_checksum_tree_block(sblock);
2055         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
2056                 (void)scrub_checksum_super(sblock);
2057         else
2058                 WARN_ON(1);
2059         if (ret)
2060                 scrub_handle_errored_block(sblock);
2061
2062         return ret;
2063 }
2064
2065 static int scrub_checksum_data(struct scrub_block *sblock)
2066 {
2067         struct scrub_ctx *sctx = sblock->sctx;
2068         u8 csum[BTRFS_CSUM_SIZE];
2069         u8 *on_disk_csum;
2070         struct page *page;
2071         void *buffer;
2072         u32 crc = ~(u32)0;
2073         u64 len;
2074         int index;
2075
2076         BUG_ON(sblock->page_count < 1);
2077         if (!sblock->pagev[0]->have_csum)
2078                 return 0;
2079
2080         on_disk_csum = sblock->pagev[0]->csum;
2081         page = sblock->pagev[0]->page;
2082         buffer = kmap_atomic(page);
2083
2084         len = sctx->fs_info->sectorsize;
2085         index = 0;
2086         for (;;) {
2087                 u64 l = min_t(u64, len, PAGE_SIZE);
2088
2089                 crc = btrfs_csum_data(buffer, crc, l);
2090                 kunmap_atomic(buffer);
2091                 len -= l;
2092                 if (len == 0)
2093                         break;
2094                 index++;
2095                 BUG_ON(index >= sblock->page_count);
2096                 BUG_ON(!sblock->pagev[index]->page);
2097                 page = sblock->pagev[index]->page;
2098                 buffer = kmap_atomic(page);
2099         }
2100
2101         btrfs_csum_final(crc, csum);
2102         if (memcmp(csum, on_disk_csum, sctx->csum_size))
2103                 sblock->checksum_error = 1;
2104
2105         return sblock->checksum_error;
2106 }
2107
2108 static int scrub_checksum_tree_block(struct scrub_block *sblock)
2109 {
2110         struct scrub_ctx *sctx = sblock->sctx;
2111         struct btrfs_header *h;
2112         struct btrfs_fs_info *fs_info = sctx->fs_info;
2113         u8 calculated_csum[BTRFS_CSUM_SIZE];
2114         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2115         struct page *page;
2116         void *mapped_buffer;
2117         u64 mapped_size;
2118         void *p;
2119         u32 crc = ~(u32)0;
2120         u64 len;
2121         int index;
2122
2123         BUG_ON(sblock->page_count < 1);
2124         page = sblock->pagev[0]->page;
2125         mapped_buffer = kmap_atomic(page);
2126         h = (struct btrfs_header *)mapped_buffer;
2127         memcpy(on_disk_csum, h->csum, sctx->csum_size);
2128
2129         /*
2130          * we don't use the getter functions here, as we
2131          * a) don't have an extent buffer and
2132          * b) the page is already kmapped
2133          */
2134         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
2135                 sblock->header_error = 1;
2136
2137         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
2138                 sblock->header_error = 1;
2139                 sblock->generation_error = 1;
2140         }
2141
2142         if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
2143                 sblock->header_error = 1;
2144
2145         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
2146                    BTRFS_UUID_SIZE))
2147                 sblock->header_error = 1;
2148
2149         len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
2150         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2151         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2152         index = 0;
2153         for (;;) {
2154                 u64 l = min_t(u64, len, mapped_size);
2155
2156                 crc = btrfs_csum_data(p, crc, l);
2157                 kunmap_atomic(mapped_buffer);
2158                 len -= l;
2159                 if (len == 0)
2160                         break;
2161                 index++;
2162                 BUG_ON(index >= sblock->page_count);
2163                 BUG_ON(!sblock->pagev[index]->page);
2164                 page = sblock->pagev[index]->page;
2165                 mapped_buffer = kmap_atomic(page);
2166                 mapped_size = PAGE_SIZE;
2167                 p = mapped_buffer;
2168         }
2169
2170         btrfs_csum_final(crc, calculated_csum);
2171         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2172                 sblock->checksum_error = 1;
2173
2174         return sblock->header_error || sblock->checksum_error;
2175 }
2176
2177 static int scrub_checksum_super(struct scrub_block *sblock)
2178 {
2179         struct btrfs_super_block *s;
2180         struct scrub_ctx *sctx = sblock->sctx;
2181         u8 calculated_csum[BTRFS_CSUM_SIZE];
2182         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2183         struct page *page;
2184         void *mapped_buffer;
2185         u64 mapped_size;
2186         void *p;
2187         u32 crc = ~(u32)0;
2188         int fail_gen = 0;
2189         int fail_cor = 0;
2190         u64 len;
2191         int index;
2192
2193         BUG_ON(sblock->page_count < 1);
2194         page = sblock->pagev[0]->page;
2195         mapped_buffer = kmap_atomic(page);
2196         s = (struct btrfs_super_block *)mapped_buffer;
2197         memcpy(on_disk_csum, s->csum, sctx->csum_size);
2198
2199         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
2200                 ++fail_cor;
2201
2202         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
2203                 ++fail_gen;
2204
2205         if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
2206                 ++fail_cor;
2207
2208         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2209         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2210         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2211         index = 0;
2212         for (;;) {
2213                 u64 l = min_t(u64, len, mapped_size);
2214
2215                 crc = btrfs_csum_data(p, crc, l);
2216                 kunmap_atomic(mapped_buffer);
2217                 len -= l;
2218                 if (len == 0)
2219                         break;
2220                 index++;
2221                 BUG_ON(index >= sblock->page_count);
2222                 BUG_ON(!sblock->pagev[index]->page);
2223                 page = sblock->pagev[index]->page;
2224                 mapped_buffer = kmap_atomic(page);
2225                 mapped_size = PAGE_SIZE;
2226                 p = mapped_buffer;
2227         }
2228
2229         btrfs_csum_final(crc, calculated_csum);
2230         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2231                 ++fail_cor;
2232
2233         if (fail_cor + fail_gen) {
2234                 /*
2235                  * If we find an error in a super block, we just report it.
2236                  * Super blocks get rewritten with the next transaction
2237                  * commit anyway.
2238                  */
2239                 spin_lock(&sctx->stat_lock);
2240                 ++sctx->stat.super_errors;
2241                 spin_unlock(&sctx->stat_lock);
2242                 if (fail_cor)
2243                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2244                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2245                 else
2246                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2247                                 BTRFS_DEV_STAT_GENERATION_ERRS);
2248         }
2249
2250         return fail_cor + fail_gen;
2251 }
2252
2253 static void scrub_block_get(struct scrub_block *sblock)
2254 {
2255         refcount_inc(&sblock->refs);
2256 }
2257
2258 static void scrub_block_put(struct scrub_block *sblock)
2259 {
2260         if (refcount_dec_and_test(&sblock->refs)) {
2261                 int i;
2262
2263                 if (sblock->sparity)
2264                         scrub_parity_put(sblock->sparity);
2265
2266                 for (i = 0; i < sblock->page_count; i++)
2267                         scrub_page_put(sblock->pagev[i]);
2268                 kfree(sblock);
2269         }
2270 }
2271
2272 static void scrub_page_get(struct scrub_page *spage)
2273 {
2274         atomic_inc(&spage->refs);
2275 }
2276
2277 static void scrub_page_put(struct scrub_page *spage)
2278 {
2279         if (atomic_dec_and_test(&spage->refs)) {
2280                 if (spage->page)
2281                         __free_page(spage->page);
2282                 kfree(spage);
2283         }
2284 }
2285
2286 static void scrub_submit(struct scrub_ctx *sctx)
2287 {
2288         struct scrub_bio *sbio;
2289
2290         if (sctx->curr == -1)
2291                 return;
2292
2293         sbio = sctx->bios[sctx->curr];
2294         sctx->curr = -1;
2295         scrub_pending_bio_inc(sctx);
2296         btrfsic_submit_bio(sbio->bio);
2297 }
2298
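/*
 * Queue a page for reading. Like the write side, pages are batched into the
 * current read bio while they stay physically and logically contiguous and
 * belong to the same device; any discontinuity submits the bio and retries
 * with a fresh one taken from (or waited for on) the free list.
 */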
2299 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2300                                     struct scrub_page *spage)
2301 {
2302         struct scrub_block *sblock = spage->sblock;
2303         struct scrub_bio *sbio;
2304         int ret;
2305
2306 again:
2307         /*
2308          * grab a fresh bio or wait for one to become available
2309          */
2310         while (sctx->curr == -1) {
2311                 spin_lock(&sctx->list_lock);
2312                 sctx->curr = sctx->first_free;
2313                 if (sctx->curr != -1) {
2314                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2315                         sctx->bios[sctx->curr]->next_free = -1;
2316                         sctx->bios[sctx->curr]->page_count = 0;
2317                         spin_unlock(&sctx->list_lock);
2318                 } else {
2319                         spin_unlock(&sctx->list_lock);
2320                         wait_event(sctx->list_wait, sctx->first_free != -1);
2321                 }
2322         }
2323         sbio = sctx->bios[sctx->curr];
2324         if (sbio->page_count == 0) {
2325                 struct bio *bio;
2326
2327                 sbio->physical = spage->physical;
2328                 sbio->logical = spage->logical;
2329                 sbio->dev = spage->dev;
2330                 bio = sbio->bio;
2331                 if (!bio) {
2332                         bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
2333                         sbio->bio = bio;
2334                 }
2335
2336                 bio->bi_private = sbio;
2337                 bio->bi_end_io = scrub_bio_end_io;
2338                 bio_set_dev(bio, sbio->dev->bdev);
2339                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2340                 bio->bi_opf = REQ_OP_READ;
2341                 sbio->status = 0;
2342         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2343                    spage->physical ||
2344                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2345                    spage->logical ||
2346                    sbio->dev != spage->dev) {
2347                 scrub_submit(sctx);
2348                 goto again;
2349         }
2350
2351         sbio->pagev[sbio->page_count] = spage;
2352         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2353         if (ret != PAGE_SIZE) {
2354                 if (sbio->page_count < 1) {
2355                         bio_put(sbio->bio);
2356                         sbio->bio = NULL;
2357                         return -EIO;
2358                 }
2359                 scrub_submit(sctx);
2360                 goto again;
2361         }
2362
2363         scrub_block_get(sblock); /* one for the page added to the bio */
2364         atomic_inc(&sblock->outstanding_pages);
2365         sbio->page_count++;
2366         if (sbio->page_count == sctx->pages_per_rd_bio)
2367                 scrub_submit(sctx);
2368
2369         return 0;
2370 }
2371
2372 static void scrub_missing_raid56_end_io(struct bio *bio)
2373 {
2374         struct scrub_block *sblock = bio->bi_private;
2375         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2376
2377         if (bio->bi_status)
2378                 sblock->no_io_error_seen = 0;
2379
2380         bio_put(bio);
2381
2382         btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2383 }
2384
2385 static void scrub_missing_raid56_worker(struct btrfs_work *work)
2386 {
2387         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2388         struct scrub_ctx *sctx = sblock->sctx;
2389         struct btrfs_fs_info *fs_info = sctx->fs_info;
2390         u64 logical;
2391         struct btrfs_device *dev;
2392
2393         logical = sblock->pagev[0]->logical;
2394         dev = sblock->pagev[0]->dev;
2395
2396         if (sblock->no_io_error_seen)
2397                 scrub_recheck_block_checksum(sblock);
2398
2399         if (!sblock->no_io_error_seen) {
2400                 spin_lock(&sctx->stat_lock);
2401                 sctx->stat.read_errors++;
2402                 spin_unlock(&sctx->stat_lock);
2403                 btrfs_err_rl_in_rcu(fs_info,
2404                         "IO error rebuilding logical %llu for dev %s",
2405                         logical, rcu_str_deref(dev->name));
2406         } else if (sblock->header_error || sblock->checksum_error) {
2407                 spin_lock(&sctx->stat_lock);
2408                 sctx->stat.uncorrectable_errors++;
2409                 spin_unlock(&sctx->stat_lock);
2410                 btrfs_err_rl_in_rcu(fs_info,
2411                         "failed to rebuild valid logical %llu for dev %s",
2412                         logical, rcu_str_deref(dev->name));
2413         } else {
2414                 scrub_write_block_to_dev_replace(sblock);
2415         }
2416
2417         scrub_block_put(sblock);
2418
2419         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2420                 mutex_lock(&sctx->wr_lock);
2421                 scrub_wr_submit(sctx);
2422                 mutex_unlock(&sctx->wr_lock);
2423         }
2424
2425         scrub_pending_bio_dec(sctx);
2426 }
2427
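/*
 * The device holding this block is missing, so the data cannot be read
 * directly. Instead, hand the pages to the raid56 code and let it rebuild
 * the content from the remaining stripes; the result is checked and, in the
 * dev replace case, written to the target in scrub_missing_raid56_worker().
 */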
2428 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2429 {
2430         struct scrub_ctx *sctx = sblock->sctx;
2431         struct btrfs_fs_info *fs_info = sctx->fs_info;
2432         u64 length = sblock->page_count * PAGE_SIZE;
2433         u64 logical = sblock->pagev[0]->logical;
2434         struct btrfs_bio *bbio = NULL;
2435         struct bio *bio;
2436         struct btrfs_raid_bio *rbio;
2437         int ret;
2438         int i;
2439
2440         btrfs_bio_counter_inc_blocked(fs_info);
2441         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2442                         &length, &bbio);
2443         if (ret || !bbio || !bbio->raid_map)
2444                 goto bbio_out;
2445
2446         if (WARN_ON(!sctx->is_dev_replace ||
2447                     !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2448                 /*
2449                  * We shouldn't be scrubbing a missing device. Even for dev
2450                  * replace, we should only get here for RAID 5/6. We either
2451                  * managed to mount something with no mirrors remaining or
2452                  * there's a bug in scrub_remap_extent()/btrfs_map_block().
2453                  */
2454                 goto bbio_out;
2455         }
2456
2457         bio = btrfs_io_bio_alloc(0);
2458         bio->bi_iter.bi_sector = logical >> 9;
2459         bio->bi_private = sblock;
2460         bio->bi_end_io = scrub_missing_raid56_end_io;
2461
2462         rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2463         if (!rbio)
2464                 goto rbio_out;
2465
2466         for (i = 0; i < sblock->page_count; i++) {
2467                 struct scrub_page *spage = sblock->pagev[i];
2468
2469                 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2470         }
2471
2472         btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2473                         scrub_missing_raid56_worker, NULL, NULL);
2474         scrub_block_get(sblock);
2475         scrub_pending_bio_inc(sctx);
2476         raid56_submit_missing_rbio(rbio);
2477         return;
2478
2479 rbio_out:
2480         bio_put(bio);
2481 bbio_out:
2482         btrfs_bio_counter_dec(fs_info);
2483         btrfs_put_bbio(bbio);
2484         spin_lock(&sctx->stat_lock);
2485         sctx->stat.malloc_errors++;
2486         spin_unlock(&sctx->stat_lock);
2487 }
2488
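/*
 * Create a scrub_block for the range [logical, logical + len) with one
 * scrub_page per PAGE_SIZE piece and queue the pages into the read bio.
 * If the device is missing, the block is instead handed to the raid56
 * rebuild path (see scrub_missing_raid56_pages()).
 */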
2489 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2490                        u64 physical, struct btrfs_device *dev, u64 flags,
2491                        u64 gen, int mirror_num, u8 *csum, int force,
2492                        u64 physical_for_dev_replace)
2493 {
2494         struct scrub_block *sblock;
2495         int index;
2496
2497         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2498         if (!sblock) {
2499                 spin_lock(&sctx->stat_lock);
2500                 sctx->stat.malloc_errors++;
2501                 spin_unlock(&sctx->stat_lock);
2502                 return -ENOMEM;
2503         }
2504
2505         /* one ref inside this function, plus one for each page added to
2506          * a bio later on */
2507         refcount_set(&sblock->refs, 1);
2508         sblock->sctx = sctx;
2509         sblock->no_io_error_seen = 1;
2510
2511         for (index = 0; len > 0; index++) {
2512                 struct scrub_page *spage;
2513                 u64 l = min_t(u64, len, PAGE_SIZE);
2514
2515                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2516                 if (!spage) {
2517 leave_nomem:
2518                         spin_lock(&sctx->stat_lock);
2519                         sctx->stat.malloc_errors++;
2520                         spin_unlock(&sctx->stat_lock);
2521                         scrub_block_put(sblock);
2522                         return -ENOMEM;
2523                 }
2524                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2525                 scrub_page_get(spage);
2526                 sblock->pagev[index] = spage;
2527                 spage->sblock = sblock;
2528                 spage->dev = dev;
2529                 spage->flags = flags;
2530                 spage->generation = gen;
2531                 spage->logical = logical;
2532                 spage->physical = physical;
2533                 spage->physical_for_dev_replace = physical_for_dev_replace;
2534                 spage->mirror_num = mirror_num;
2535                 if (csum) {
2536                         spage->have_csum = 1;
2537                         memcpy(spage->csum, csum, sctx->csum_size);
2538                 } else {
2539                         spage->have_csum = 0;
2540                 }
2541                 sblock->page_count++;
2542                 spage->page = alloc_page(GFP_KERNEL);
2543                 if (!spage->page)
2544                         goto leave_nomem;
2545                 len -= l;
2546                 logical += l;
2547                 physical += l;
2548                 physical_for_dev_replace += l;
2549         }
2550
2551         WARN_ON(sblock->page_count == 0);
2552         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2553                 /*
2554                  * This case should only be hit for RAID 5/6 device replace. See
2555                  * the comment in scrub_missing_raid56_pages() for details.
2556                  */
2557                 scrub_missing_raid56_pages(sblock);
2558         } else {
2559                 for (index = 0; index < sblock->page_count; index++) {
2560                         struct scrub_page *spage = sblock->pagev[index];
2561                         int ret;
2562
2563                         ret = scrub_add_page_to_rd_bio(sctx, spage);
2564                         if (ret) {
2565                                 scrub_block_put(sblock);
2566                                 return ret;
2567                         }
2568                 }
2569
2570                 if (force)
2571                         scrub_submit(sctx);
2572         }
2573
2574         /* last one frees, either here or in bio completion for last page */
2575         scrub_block_put(sblock);
2576         return 0;
2577 }
2578
2579 static void scrub_bio_end_io(struct bio *bio)
2580 {
2581         struct scrub_bio *sbio = bio->bi_private;
2582         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2583
2584         sbio->status = bio->bi_status;
2585         sbio->bio = bio;
2586
2587         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2588 }
2589
2590 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2591 {
2592         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2593         struct scrub_ctx *sctx = sbio->sctx;
2594         int i;
2595
2596         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2597         if (sbio->status) {
2598                 for (i = 0; i < sbio->page_count; i++) {
2599                         struct scrub_page *spage = sbio->pagev[i];
2600
2601                         spage->io_error = 1;
2602                         spage->sblock->no_io_error_seen = 0;
2603                 }
2604         }
2605
2606         /* now complete the scrub_block items that have all pages completed */
2607         for (i = 0; i < sbio->page_count; i++) {
2608                 struct scrub_page *spage = sbio->pagev[i];
2609                 struct scrub_block *sblock = spage->sblock;
2610
2611                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2612                         scrub_block_complete(sblock);
2613                 scrub_block_put(sblock);
2614         }
2615
2616         bio_put(sbio->bio);
2617         sbio->bio = NULL;
2618         spin_lock(&sctx->list_lock);
2619         sbio->next_free = sctx->first_free;
2620         sctx->first_free = sbio->index;
2621         spin_unlock(&sctx->list_lock);
2622
2623         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2624                 mutex_lock(&sctx->wr_lock);
2625                 scrub_wr_submit(sctx);
2626                 mutex_unlock(&sctx->wr_lock);
2627         }
2628
2629         scrub_pending_bio_dec(sctx);
2630 }
2631
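/*
 * Mark the sectors of [start, start + len) in a per-stripe bitmap. The
 * start is reduced modulo stripe_len, so a range that runs past the end of
 * the stripe wraps around to the beginning of the bitmap. For example, with
 * nsectors == 16, an offset of 14 sectors and a length of 4 sectors sets
 * bits 14, 15, 0 and 1.
 */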
2632 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2633                                        unsigned long *bitmap,
2634                                        u64 start, u64 len)
2635 {
2636         u64 offset;
2637         u64 nsectors64;
2638         u32 nsectors;
2639         int sectorsize = sparity->sctx->fs_info->sectorsize;
2640
2641         if (len >= sparity->stripe_len) {
2642                 bitmap_set(bitmap, 0, sparity->nsectors);
2643                 return;
2644         }
2645
2646         start -= sparity->logic_start;
2647         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2648         offset = div_u64(offset, sectorsize);
2649         nsectors64 = div_u64(len, sectorsize);
2650
2651         ASSERT(nsectors64 < UINT_MAX);
2652         nsectors = (u32)nsectors64;
2653
2654         if (offset + nsectors <= sparity->nsectors) {
2655                 bitmap_set(bitmap, offset, nsectors);
2656                 return;
2657         }
2658
2659         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2660         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2661 }
2662
2663 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2664                                                    u64 start, u64 len)
2665 {
2666         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2667 }
2668
2669 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2670                                                   u64 start, u64 len)
2671 {
2672         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2673 }
2674
2675 static void scrub_block_complete(struct scrub_block *sblock)
2676 {
2677         int corrupted = 0;
2678
2679         if (!sblock->no_io_error_seen) {
2680                 corrupted = 1;
2681                 scrub_handle_errored_block(sblock);
2682         } else {
2683                 /*
2684                  * In the dev replace case: if the block has a checksum
2685                  * error, it is written via the repair mechanism above;
2686                  * otherwise it is written to the target device here.
2687                  */
2688                 corrupted = scrub_checksum(sblock);
2689                 if (!corrupted && sblock->sctx->is_dev_replace)
2690                         scrub_write_block_to_dev_replace(sblock);
2691         }
2692
2693         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2694                 u64 start = sblock->pagev[0]->logical;
2695                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2696                           PAGE_SIZE;
2697
2698                 scrub_parity_mark_sectors_error(sblock->sparity,
2699                                                 start, end - start);
2700         }
2701 }
2702
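/*
 * Look up the checksum for @logical in the ordered csum_list. Sums that end
 * at or before @logical are dropped (counted as csum_discards); if a sum
 * item covers @logical, the checksum of that sector is copied to @csum and
 * the item is freed once its last sector has been consumed. Returns 1 if a
 * checksum was found, 0 otherwise.
 */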
2703 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2704 {
2705         struct btrfs_ordered_sum *sum = NULL;
2706         unsigned long index;
2707         unsigned long num_sectors;
2708
2709         while (!list_empty(&sctx->csum_list)) {
2710                 sum = list_first_entry(&sctx->csum_list,
2711                                        struct btrfs_ordered_sum, list);
2712                 if (sum->bytenr > logical)
2713                         return 0;
2714                 if (sum->bytenr + sum->len > logical)
2715                         break;
2716
2717                 ++sctx->stat.csum_discards;
2718                 list_del(&sum->list);
2719                 kfree(sum);
2720                 sum = NULL;
2721         }
2722         if (!sum)
2723                 return 0;
2724
2725         index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
2726         ASSERT(index < UINT_MAX);
2727
2728         num_sectors = sum->len / sctx->fs_info->sectorsize;
2729         memcpy(csum, sum->sums + index, sctx->csum_size);
2730         if (index == num_sectors - 1) {
2731                 list_del(&sum->list);
2732                 kfree(sum);
2733         }
2734         return 1;
2735 }
2736
2737 /* scrub extent tries to collect up to 64 kB for each bio */
2738 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2739                         u64 logical, u64 len,
2740                         u64 physical, struct btrfs_device *dev, u64 flags,
2741                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2742 {
2743         int ret;
2744         u8 csum[BTRFS_CSUM_SIZE];
2745         u32 blocksize;
2746
2747         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2748                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2749                         blocksize = map->stripe_len;
2750                 else
2751                         blocksize = sctx->fs_info->sectorsize;
2752                 spin_lock(&sctx->stat_lock);
2753                 sctx->stat.data_extents_scrubbed++;
2754                 sctx->stat.data_bytes_scrubbed += len;
2755                 spin_unlock(&sctx->stat_lock);
2756         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2757                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2758                         blocksize = map->stripe_len;
2759                 else
2760                         blocksize = sctx->fs_info->nodesize;
2761                 spin_lock(&sctx->stat_lock);
2762                 sctx->stat.tree_extents_scrubbed++;
2763                 sctx->stat.tree_bytes_scrubbed += len;
2764                 spin_unlock(&sctx->stat_lock);
2765         } else {
2766                 blocksize = sctx->fs_info->sectorsize;
2767                 WARN_ON(1);
2768         }
2769
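        /*
         * Split the extent into blocksize-sized chunks.  For data, the
         * checksum of each chunk is looked up here and handed down so the
         * block can be verified once its pages have been read.
         */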
2770         while (len) {
2771                 u64 l = min_t(u64, len, blocksize);
2772                 int have_csum = 0;
2773
2774                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2775                         /* push csums to sbio */
2776                         have_csum = scrub_find_csum(sctx, logical, csum);
2777                         if (have_csum == 0)
2778                                 ++sctx->stat.no_csum;
2779                 }
2780                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2781                                   mirror_num, have_csum ? csum : NULL, 0,
2782                                   physical_for_dev_replace);
2783                 if (ret)
2784                         return ret;
2785                 len -= l;
2786                 logical += l;
2787                 physical += l;
2788                 physical_for_dev_replace += l;
2789         }
2790         return 0;
2791 }
2792
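/*
 * Like scrub_pages(), but for blocks inside a RAID5/6 full stripe: each
 * page gets an extra reference and is linked into the parity context so
 * that the parity check can run once all reads have completed.
 */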
2793 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2794                                   u64 logical, u64 len,
2795                                   u64 physical, struct btrfs_device *dev,
2796                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2797 {
2798         struct scrub_ctx *sctx = sparity->sctx;
2799         struct scrub_block *sblock;
2800         int index;
2801
2802         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2803         if (!sblock) {
2804                 spin_lock(&sctx->stat_lock);
2805                 sctx->stat.malloc_errors++;
2806                 spin_unlock(&sctx->stat_lock);
2807                 return -ENOMEM;
2808         }
2809
2810         /* one ref inside this function, plus one for each page added to
2811          * a bio later on */
2812         refcount_set(&sblock->refs, 1);
2813         sblock->sctx = sctx;
2814         sblock->no_io_error_seen = 1;
2815         sblock->sparity = sparity;
2816         scrub_parity_get(sparity);
2817
2818         for (index = 0; len > 0; index++) {
2819                 struct scrub_page *spage;
2820                 u64 l = min_t(u64, len, PAGE_SIZE);
2821
2822                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2823                 if (!spage) {
2824 leave_nomem:
2825                         spin_lock(&sctx->stat_lock);
2826                         sctx->stat.malloc_errors++;
2827                         spin_unlock(&sctx->stat_lock);
2828                         scrub_block_put(sblock);
2829                         return -ENOMEM;
2830                 }
2831                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2832                 /* For scrub block */
2833                 scrub_page_get(spage);
2834                 sblock->pagev[index] = spage;
2835                 /* For scrub parity */
2836                 scrub_page_get(spage);
2837                 list_add_tail(&spage->list, &sparity->spages);
2838                 spage->sblock = sblock;
2839                 spage->dev = dev;
2840                 spage->flags = flags;
2841                 spage->generation = gen;
2842                 spage->logical = logical;
2843                 spage->physical = physical;
2844                 spage->mirror_num = mirror_num;
2845                 if (csum) {
2846                         spage->have_csum = 1;
2847                         memcpy(spage->csum, csum, sctx->csum_size);
2848                 } else {
2849                         spage->have_csum = 0;
2850                 }
2851                 sblock->page_count++;
2852                 spage->page = alloc_page(GFP_KERNEL);
2853                 if (!spage->page)
2854                         goto leave_nomem;
2855                 len -= l;
2856                 logical += l;
2857                 physical += l;
2858         }
2859
2860         WARN_ON(sblock->page_count == 0);
2861         for (index = 0; index < sblock->page_count; index++) {
2862                 struct scrub_page *spage = sblock->pagev[index];
2863                 int ret;
2864
2865                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2866                 if (ret) {
2867                         scrub_block_put(sblock);
2868                         return ret;
2869                 }
2870         }
2871
2872         /* the last reference frees the block, here or in the bio completion of the last page */
2873         scrub_block_put(sblock);
2874         return 0;
2875 }
2876
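/*
 * Counterpart of scrub_extent() for extents that are part of a RAID5/6
 * full stripe.  Sectors on a missing device are flagged in the error
 * bitmap right away, and data sectors without a checksum are skipped.
 */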
2877 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2878                                    u64 logical, u64 len,
2879                                    u64 physical, struct btrfs_device *dev,
2880                                    u64 flags, u64 gen, int mirror_num)
2881 {
2882         struct scrub_ctx *sctx = sparity->sctx;
2883         int ret;
2884         u8 csum[BTRFS_CSUM_SIZE];
2885         u32 blocksize;
2886
2887         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2888                 scrub_parity_mark_sectors_error(sparity, logical, len);
2889                 return 0;
2890         }
2891
2892         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2893                 blocksize = sparity->stripe_len;
2894         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2895                 blocksize = sparity->stripe_len;
2896         } else {
2897                 blocksize = sctx->fs_info->sectorsize;
2898                 WARN_ON(1);
2899         }
2900
2901         while (len) {
2902                 u64 l = min_t(u64, len, blocksize);
2903                 int have_csum = 0;
2904
2905                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2906                         /* push csums to sbio */
2907                         have_csum = scrub_find_csum(sctx, logical, csum);
2908                         if (have_csum == 0)
2909                                 goto skip;
2910                 }
2911                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2912                                              flags, gen, mirror_num,
2913                                              have_csum ? csum : NULL);
2914                 if (ret)
2915                         return ret;
2916 skip:
2917                 len -= l;
2918                 logical += l;
2919                 physical += l;
2920         }
2921         return 0;
2922 }
2923
2924 /*
2925  * Given a physical address, calculate its logical offset.  If the
2926  * physical address falls on a parity stripe, return the logical
2927  * offset of the left-most data stripe of that full stripe instead.
2928  *
2929  * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2930  */
2931 static int get_raid56_logic_offset(u64 physical, int num,
2932                                    struct map_lookup *map, u64 *offset,
2933                                    u64 *stripe_start)
2934 {
2935         int i;
2936         int j = 0;
2937         u64 stripe_nr;
2938         u64 last_offset;
2939         u32 stripe_index;
2940         u32 rot;
2941
2942         last_offset = (physical - map->stripes[num].physical) *
2943                       nr_data_stripes(map);
2944         if (stripe_start)
2945                 *stripe_start = last_offset;
2946
2947         *offset = last_offset;
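        /*
         * Try each data stripe slot of the containing full stripe: after
         * applying the rotation, check which device the slot lands on.  If
         * one of them is @num, the physical address holds that data stripe;
         * if none matches, it holds parity.
         */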
2948         for (i = 0; i < nr_data_stripes(map); i++) {
2949                 *offset = last_offset + i * map->stripe_len;
2950
2951                 stripe_nr = div64_u64(*offset, map->stripe_len);
2952                 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2953
2954                 /* Work out the disk rotation on this stripe-set */
2955                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2956                 /* calculate which disk stripe this data lands on */
2957                 rot += i;
2958                 stripe_index = rot % map->num_stripes;
2959                 if (stripe_index == num)
2960                         return 0;
2961                 if (stripe_index < num)
2962                         j++;
2963         }
2964         *offset = last_offset + j * map->stripe_len;
2965         return 1;
2966 }
2967
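/*
 * Drop the page references still held by the parity context, account the
 * sectors left in the error bitmap as uncorrectable read errors and free
 * the context itself.
 */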
2968 static void scrub_free_parity(struct scrub_parity *sparity)
2969 {
2970         struct scrub_ctx *sctx = sparity->sctx;
2971         struct scrub_page *curr, *next;
2972         int nbits;
2973
2974         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2975         if (nbits) {
2976                 spin_lock(&sctx->stat_lock);
2977                 sctx->stat.read_errors += nbits;
2978                 sctx->stat.uncorrectable_errors += nbits;
2979                 spin_unlock(&sctx->stat_lock);
2980         }
2981
2982         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2983                 list_del_init(&curr->list);
2984                 scrub_page_put(curr);
2985         }
2986
2987         kfree(sparity);
2988 }
2989
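/* Deferred completion: free the parity context and drop the pending bio count */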
2990 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2991 {
2992         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2993                                                     work);
2994         struct scrub_ctx *sctx = sparity->sctx;
2995
2996         scrub_free_parity(sparity);
2997         scrub_pending_bio_dec(sctx);
2998 }
2999
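/*
 * Completion of the parity scrub bio.  On error, every sector that was to
 * be checked is marked as errored.  The actual cleanup is pushed to the
 * scrub_parity_workers workqueue.
 */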
3000 static void scrub_parity_bio_endio(struct bio *bio)
3001 {
3002         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
3003         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
3004
3005         if (bio->bi_status)
3006                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3007                           sparity->nsectors);
3008
3009         bio_put(bio);
3010
3011         btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
3012                         scrub_parity_bio_endio_worker, NULL, NULL);
3013         btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
3014 }
3015
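/*
 * Run the actual RAID5/6 parity check for the sectors that were scrubbed
 * without errors: map the full stripe, attach a scrub rbio to a bio without
 * payload and submit it.  Any failure along the way marks the remaining
 * sectors as errored and is accounted as an allocation error.
 */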
3016 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
3017 {
3018         struct scrub_ctx *sctx = sparity->sctx;
3019         struct btrfs_fs_info *fs_info = sctx->fs_info;
3020         struct bio *bio;
3021         struct btrfs_raid_bio *rbio;
3022         struct btrfs_bio *bbio = NULL;
3023         u64 length;
3024         int ret;
3025
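        /*
         * Remove sectors that already failed from the set to be checked;
         * if nothing is left, there is no parity work to do.
         */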
3026         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
3027                            sparity->nsectors))
3028                 goto out;
3029
3030         length = sparity->logic_end - sparity->logic_start;
3031
3032         btrfs_bio_counter_inc_blocked(fs_info);
3033         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
3034                                &length, &bbio);
3035         if (ret || !bbio || !bbio->raid_map)
3036                 goto bbio_out;
3037
3038         bio = btrfs_io_bio_alloc(0);
3039         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
3040         bio->bi_private = sparity;
3041         bio->bi_end_io = scrub_parity_bio_endio;
3042
3043         rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
3044                                               length, sparity->scrub_dev,
3045                                               sparity->dbitmap,
3046                                               sparity->nsectors);
3047         if (!rbio)
3048                 goto rbio_out;
3049
3050         scrub_pending_bio_inc(sctx);
3051         raid56_parity_submit_scrub_rbio(rbio);
3052         return;
3053
3054 rbio_out:
3055         bio_put(bio);
3056 bbio_out:
3057         btrfs_bio_counter_dec(fs_info);
3058         btrfs_put_bbio(bbio);
3059         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3060                   sparity->nsectors);
3061         spin_lock(&sctx->stat_lock);
3062         sctx->stat.malloc_errors++;
3063         spin_unlock(&sctx->stat_lock);
3064 out:
3065         scrub_free_parity(sparity);
3066 }
3067
3068 static inline int scrub_calc_parity_bitmap_len(int nsectors)
3069 {
3070         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
3071 }
3072
3073 static void scrub_parity_get(struct scrub_parity *sparity)
3074 {
3075         refcount_inc(&sparity->refs);
3076 }
3077
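/* Dropping the last reference triggers the parity check and repair */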
3078 static void scrub_parity_put(struct scrub_parity *sparity)
3079 {
3080         if (!refcount_dec_and_test(&sparity->refs))
3081                 return;
3082
3083         scrub_parity_check_and_repair(sparity);
3084 }
3085
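/*
 * Scrub one RAID5/6 full-stripe range [logic_start, logic_end): walk the
 * extent tree for all extents in that range and scrub them against the
 * parity context; the parity itself is checked when the last reference to
 * the context is dropped.
 */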
3086 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3087                                                   struct map_lookup *map,
3088                                                   struct btrfs_device *sdev,
3089                                                   struct btrfs_path *path,
3090                                                   u64 logic_start,
3091                                                   u64 logic_end)
3092 {
3093         struct btrfs_fs_info *fs_info = sctx->fs_info;
3094         struct btrfs_root *root = fs_info->extent_root;
3095         struct btrfs_root *csum_root = fs_info->csum_root;
3096         struct btrfs_extent_item *extent;
3097         struct btrfs_bio *bbio = NULL;
3098         u64 flags;
3099         int ret;
3100         int slot;
3101         struct extent_buffer *l;
3102         struct btrfs_key key;
3103         u64 generation;
3104         u64 extent_logical;
3105         u64 extent_physical;
3106         u64 extent_len;
3107         u64 mapped_length;
3108         struct btrfs_device *extent_dev;
3109         struct scrub_parity *sparity;
3110         int nsectors;
3111         int bitmap_len;
3112         int extent_mirror_num;
3113         int stop_loop = 0;
3114
3115         nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
3116         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
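        /* dbitmap and ebitmap share one allocation of 2 * bitmap_len bytes */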
3117         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
3118                           GFP_NOFS);
3119         if (!sparity) {
3120                 spin_lock(&sctx->stat_lock);
3121                 sctx->stat.malloc_errors++;
3122                 spin_unlock(&sctx->stat_lock);
3123                 return -ENOMEM;
3124         }
3125
3126         sparity->stripe_len = map->stripe_len;
3127         sparity->nsectors = nsectors;
3128         sparity->sctx = sctx;
3129         sparity->scrub_dev = sdev;
3130         sparity->logic_start = logic_start;
3131         sparity->logic_end = logic_end;
3132         refcount_set(&sparity->refs, 1);
3133         INIT_LIST_HEAD(&sparity->spages);
3134         sparity->dbitmap = sparity->bitmap;
3135         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
3136
3137         ret = 0;
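        /*
         * Walk all extent items that intersect [logic_start, logic_end),
         * restarting the extent tree search from logic_start on each pass
         * of the outer loop.
         */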
3138         while (logic_start < logic_end) {
3139                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3140                         key.type = BTRFS_METADATA_ITEM_KEY;
3141                 else
3142                         key.type = BTRFS_EXTENT_ITEM_KEY;
3143                 key.objectid = logic_start;
3144                 key.offset = (u64)-1;
3145
3146                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3147                 if (ret < 0)
3148                         goto out;
3149
3150                 if (ret > 0) {
3151                         ret = btrfs_previous_extent_item(root, path, 0);
3152                         if (ret < 0)
3153                                 goto out;
3154                         if (ret > 0) {
3155                                 btrfs_release_path(path);
3156                                 ret = btrfs_search_slot(NULL, root, &key,
3157                                                         path, 0, 0);
3158                                 if (ret < 0)
3159                                         goto out;
3160                         }
3161                 }
3162
3163                 stop_loop = 0;
3164                 while (1) {
3165                         u64 bytes;
3166
3167                         l = path->nodes[0];
3168                         slot = path->slots[0];
3169                         if (slot >= btrfs_header_nritems(l)) {
3170                                 ret = btrfs_next_leaf(root, path);
3171                                 if (ret == 0)
3172                                         continue;
3173                                 if (ret < 0)
3174                                         goto out;
3175
3176                                 stop_loop = 1;
3177                                 break;
3178                         }
3179                         btrfs_item_key_to_cpu(l, &key, slot);
3180
3181                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3182                             key.type != BTRFS_METADATA_ITEM_KEY)
3183                                 goto next;
3184
3185                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3186                                 bytes = fs_info->nodesize;
3187                         else
3188                                 bytes = key.offset;
3189